Commit 15648029 authored by Michael Carilli

Merge branch 'FDecaYed-deyuf/fused_optimizer_v2'

parents 880ab925 b9f0995b
import torch
import xentropy_cuda
class SoftmaxCrossEntropyLoss(torch.autograd.Function):
@staticmethod
def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False):
losses, max_log_sum_exp = xentropy_cuda.forward(
logits, labels, smoothing, half_to_float)
losses.masked_fill_(labels==padding_idx, 0)
ctx.save_for_backward(logits, max_log_sum_exp, labels,
torch.FloatTensor([smoothing]),
torch.LongTensor([padding_idx]))
return losses
@staticmethod
def backward(ctx, grad_loss):
logits, max_log_sum_exp, labels, smoothing, padding_idx = ctx.saved_tensors
if not grad_loss.is_contiguous():
grad_loss = grad_loss.contiguous()
grad_loss.masked_fill_(labels==padding_idx.item(), 0)
grad_logits = xentropy_cuda.backward(
grad_loss.contiguous(), logits, max_log_sum_exp,
labels, smoothing.item())
return grad_logits, None, None, None, None
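# Usage sketch (illustrative, not part of this diff): calling the fused label-smoothing
# cross-entropy loss defined above. Assumes apex was built with the xentropy CUDA
# extension; the shapes and smoothing value below are made up for the example.
#
#     logits = torch.randn(32, 1000, device="cuda", requires_grad=True)
#     labels = torch.randint(0, 1000, (32,), device="cuda")
#     losses = SoftmaxCrossEntropyLoss.apply(logits, labels, 0.1)  # per-example losses
#     losses.mean().backward()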
from .fused_sgd import FusedSGD
from .fused_adam import FusedAdam
from .fused_novograd import FusedNovoGrad
from .fused_lamb import FusedLAMB
from .fp16_optimizer import FP16_Optimizer
@@ -35,6 +35,8 @@ class FP16_Optimizer(object):
dynamic_loss_args=None,
verbose=True):
print("\nfp16_optimizer is designed to only work with apex.optimizers, and will be removed in future")
print("To update, use updated optimizers with AMP.")
# The fused optimizer does all the work. We need this layer for two reason:
# 1. maintain same user API from apex.fp16_utils
# 2. keep common stuff here in case we need to add new fused optimizer later
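# Usage sketch (illustrative, not part of this diff): wrapping a fused optimizer in this
# shim. It assumes the constructor mirrors apex.fp16_utils.FP16_Optimizer, so the
# static_loss_scale argument and the model below are assumptions, not part of the diff.
#
#     optimizer = apex.optimizers.FusedAdam(model.parameters(), lr=1e-3)
#     optimizer = apex.optimizers.FP16_Optimizer(optimizer, static_loss_scale=128.0)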
......
import types
import torch
import importlib
from apex.multi_tensor_apply import multi_tensor_applier
from amp_C import multi_tensor_adam
class FusedAdam(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
This version of fused Adam implements two fusions:
- Fusion of the elementwise operations that make up the Adam update
- Applying the update to a whole group of parameters in a single multi-tensor kernel
This is a breaking change relative to the previous version: the API has changed, and gradient norm computation and loss scaling are no longer fused into the optimizer.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
@@ -21,10 +26,8 @@ class FusedAdam(torch.optim.Optimizer):
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in FusedAdam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
adam_w_mode (boolean, optional): whether to apply decoupled weight decay (also
known as AdamW) instead of L2 regularization. (default: True)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
@@ -32,116 +35,75 @@ class FusedAdam(torch.optim.Optimizer):
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params,
lr=1e-3, bias_correction = True,
betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
weight_decay=0., max_grad_norm=0., amsgrad=False):
global fused_adam_cuda
fused_adam_cuda = importlib.import_module("fused_adam_cuda")
def __init__(self, params, lr=1e-3, bias_correction=True,
betas=(0.9, 0.999), eps=1e-8, adam_w_mode=True,
weight_decay=0., amsgrad=False):
if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
betas=betas, eps=eps, weight_decay=weight_decay)
super(FusedAdam, self).__init__(params, defaults)
self.eps_mode = 0 if eps_inside_sqrt else 1
self.adam_w_mode = 1 if adam_w_mode else 0
self.dummy_overflow_buf = torch.cuda.IntTensor([0])
def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
grads (list of tensors, optional): weight gradients to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output_params (list of tensors, optional): a reduced-precision copy
of the updated weights, written out in addition to the regular
updated weights. Must be of the same type as the gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values
by before applying them to the weights. (default: 1)
"""
if any(p is not None for p in [grads, output_params, scale, grad_norms]):
raise RuntimeError('FusedAdam has been updated, please use with AMP for mixed precision.')
loss = None
if closure is not None:
loss = closure()
if grads is None:
grads_group = [None]*len(self.param_groups)
# backward compatibility
# assuming a list/generator of parameters means a single group
elif isinstance(grads, types.GeneratorType):
grads_group = [grads]
elif type(grads[0])!=list:
grads_group = [grads]
else:
grads_group = grads
if output_params is None:
output_params_group = [None]*len(self.param_groups)
elif isinstance(output_params, types.GeneratorType):
output_params_group = [output_params]
elif type(output_params[0])!=list:
output_params_group = [output_params]
else:
output_params_group = output_params
if grad_norms is None:
grad_norms = [None]*len(self.param_groups)
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
if grads_this_group is None:
grads_this_group = [None]*len(group['params'])
if output_params_this_group is None:
output_params_this_group = [None]*len(group['params'])
# compute combined scale factor for this group
combined_scale = scale
if group['max_grad_norm'] > 0:
# norm is in fact norm*scale
clip = ((grad_norm / scale) + 1e-6) / group['max_grad_norm']
if clip > 1:
combined_scale = clip * scale
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
# assume the same step across the group for now, to simplify things
# a per-parameter step could easily be supported by making it a tensor, or by passing a list into the kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
p_list, g_list, m1_list, m2_list = [], [], [], []
for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
# note: p.grad should never be set for correct operation of the mixed-precision optimizer, which sometimes sends None gradients
if p.grad is None and grad is None:
for p in group['params']:
if p.grad is None:
continue
if grad is None:
grad = p.grad.data
if grad.is_sparse:
if p.grad.data.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param
fused_adam_cuda.adam(p.data,
out_p,
exp_avg,
exp_avg_sq,
grad,
group['lr'],
beta1,
beta2,
group['eps'],
combined_scale,
state['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
p_list.append(p.data)
g_list.append(p.grad.data)
m1_list.append(state['exp_avg'])
m2_list.append(state['exp_avg_sq'])
multi_tensor_applier(multi_tensor_adam,
self.dummy_overflow_buf,
[g_list, p_list, m1_list, m2_list],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
self.adam_w_mode,
bias_correction,
group['weight_decay'])
return loss
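# Usage sketch (illustrative, not part of this diff): driving the new FusedAdam through
# apex.amp, which replaces the deprecated grads/output_params/scale arguments. The model,
# data, and opt_level below are assumptions for the example.
#
#     from apex import amp
#     from apex.optimizers import FusedAdam
#
#     model = torch.nn.Linear(1024, 1024).cuda()
#     optimizer = FusedAdam(model.parameters(), lr=1e-3, adam_w_mode=True, weight_decay=0.01)
#     model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
#
#     loss = model(torch.randn(8, 1024, device="cuda")).sum()
#     with amp.scale_loss(loss, optimizer) as scaled_loss:
#         scaled_loss.backward()
#     optimizer.step()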
import torch
from apex.multi_tensor_apply import multi_tensor_applier
class FusedLAMB(torch.optim.Optimizer):
"""Implements LAMB algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its norm. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
NOT SUPPORTED now! (default: False)
adam_w_mode (boolean, optional): whether to apply decoupled weight decay (also
known as AdamW) instead of L2 regularization. (default: True)
grad_averaging (bool, optional): whether to apply (1-beta2) to the gradient when
calculating running averages of the gradient. (default: True)
set_grad_none (bool, optional): whether to set grads to None when the zero_grad()
method is called. (default: True)
max_grad_norm (float, optional): value used to clip global grad norm
(default: 1.0)
"""
def __init__(self, params, lr=1e-3, bias_correction=True,
betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01,
amsgrad=False, adam_w_mode=True,
grad_averaging=True, set_grad_none=True,
max_grad_norm=1.0):
if amsgrad:
raise RuntimeError('FusedLAMB does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
grad_averaging=grad_averaging,
max_grad_norm=max_grad_norm)
super(FusedLAMB, self).__init__(params, defaults)
if multi_tensor_applier.available:
import amp_C
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_lamb = amp_C.multi_tensor_lamb
else:
raise RuntimeError('apex.optimizers.FusedLAMB requires cuda extensions')
self.adam_w_mode = 1 if adam_w_mode else 0
self.set_grad_none = set_grad_none
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(FusedLAMB, self).zero_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
grad_averaging = 1 if group['grad_averaging'] else 0
# assume the same step across the group for now, to simplify things
# a per-parameter step could easily be supported by making it a tensor, or by passing a list into the kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
g_16, p_16, m_16, v_16 = [], [], [], []
g_32, p_32, m_32, v_32 = [], [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError('FusedLAMB does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
if p.dtype == torch.float16:
g_16.append(p.grad.data)
p_16.append(p.data)
m_16.append(state['exp_avg'])
v_16.append(state['exp_avg_sq'])
elif p.dtype == torch.float32:
g_32.append(p.grad.data)
p_32.append(p.data)
m_32.append(state['exp_avg'])
v_32.append(state['exp_avg_sq'])
else:
raise RuntimeError('FusedLAMB only supports fp16 and fp32.')
if(len(g_16) > 0):
multi_tensor_applier(self.multi_tensor_lamb,
self._dummy_overflow_buf,
[g_16, p_16, m_16, v_16],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.adam_w_mode,
group['max_grad_norm'])
if(len(g_32) > 0):
multi_tensor_applier(self.multi_tensor_lamb,
self._dummy_overflow_buf,
[g_32, p_32, m_32, v_32],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.adam_w_mode,
group['max_grad_norm'])
return loss
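# Usage sketch (illustrative, not part of this diff): constructing FusedLAMB for
# large-batch training. max_grad_norm is forwarded to the fused kernel, so no separate
# clip_grad_norm_ call is shown; the model and hyperparameters below are assumptions.
#
#     from apex.optimizers import FusedLAMB
#
#     optimizer = FusedLAMB(model.parameters(), lr=4e-3, betas=(0.9, 0.999),
#                           weight_decay=0.01, max_grad_norm=1.0)
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()   # sets .grad to None while set_grad_none=True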
import torch
from apex.multi_tensor_apply import multi_tensor_applier
class FusedNovoGrad(torch.optim.Optimizer):
"""Implements NovoGrad algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
It has been proposed in `Jasper: An End-to-End Convolutional Neural Acoustic Model`_.
More info: https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html#novograd
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its norm. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
NOT SUPPORTED now! (default: False)
reg_inside_moment (bool, optional): whether to apply regularization (norm and L2)
inside the momentum calculation. True applies it there; False applies it
only to the update term. (default: False)
grad_averaging (bool, optional): whether to apply (1-beta2) to the gradient when
calculating running averages of the gradient. (default: True)
norm_type (int, optional): which norm to calculate for each layer.
2 for the L2 norm, 0 for the infinity norm; these are the only supported
types for now. (default: 2)
init_zero (bool, optional): whether to initialize the norm with 0 (start averaging on
the 1st step) or with the first step's norm (start averaging on the 2nd step).
True initializes with 0. (default: False)
set_grad_none (bool, optional): whether to set grads to None when the zero_grad()
method is called. (default: True)
.. _Jasper\: An End-to-End Convolutional Neural Acoustic Model:
https://arxiv.org/abs/1904.03288
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, bias_correction=True,
betas=(0.9, 0.999), eps=1e-8, weight_decay=0.,
amsgrad=False, reg_inside_moment=False,
grad_averaging=True, norm_type=2, init_zero=False,
set_grad_none=True):
if amsgrad:
raise RuntimeError('FusedNovoGrad does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
grad_averaging=grad_averaging, norm_type=norm_type,
init_zero=init_zero)
super(FusedNovoGrad, self).__init__(params, defaults)
if multi_tensor_applier.available:
import amp_C
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_novograd = amp_C.multi_tensor_novograd
else:
raise RuntimeError('apex.optimizers.FusedNovoGrad requires cuda extensions')
self.moment_mode = 0 if reg_inside_moment else 1
self.set_grad_none = set_grad_none
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(FusedNovoGrad, self).zero_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
grad_averaging = 1 if group['grad_averaging'] else 0
# assume the same step across the group for now, to simplify things
# a per-parameter step could easily be supported by making it a tensor, or by passing a list into the kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
g_16, p_16, m_16 = [], [], []
g_32, p_32, m_32 = [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError('FusedNovoGrad does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
if p.dtype == torch.float16:
g_16.append(p.grad.data)
p_16.append(p.data)
m_16.append(state['exp_avg'])
elif p.dtype == torch.float32:
g_32.append(p.grad.data)
p_32.append(p.data)
m_32.append(state['exp_avg'])
else:
raise RuntimeError('FusedNovoGrad only supports fp16 and fp32.')
# we store the per-weight norm as one tensor per group/precision combination
# unlike optim.Adam, we store the norm here (not its square) so the calculation can be unified across norm types
if 'exp_avg_sq' not in group:
group['exp_avg_sq'] = [None, None]
if group['init_zero']:
group['exp_avg_sq'][0] = torch.cuda.FloatTensor(len(g_16)).contiguous().fill_(0)
group['exp_avg_sq'][1] = torch.cuda.FloatTensor(len(g_32)).contiguous().fill_(0)
else: # init with the first step's norm, so the first blend has no effect
if group['norm_type'] == 0:
v_16 = [torch.max(torch.abs(g.to(torch.float32))).item() for g in g_16]
v_32 = [torch.max(torch.abs(g)).item() for g in g_32]
elif group['norm_type'] == 2:
v_16 = [torch.sum(torch.pow(g.to(torch.float32), 2)).sqrt().item() for g in g_16]
v_32 = [torch.sum(torch.pow(g, 2)).sqrt().item() for g in g_32]
else:
raise RuntimeError('FusedNovoGrad only supports the L2/inf norms for now.')
group['exp_avg_sq'][0] = torch.cuda.FloatTensor(v_16)
group['exp_avg_sq'][1] = torch.cuda.FloatTensor(v_32)
else:
assert(len(g_16) == group['exp_avg_sq'][0].numel())
assert(len(g_32) == group['exp_avg_sq'][1].numel())
if(len(g_16) > 0):
multi_tensor_applier(self.multi_tensor_novograd,
self._dummy_overflow_buf,
[g_16, p_16, m_16],
group['exp_avg_sq'][0],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.moment_mode,
group['norm_type'])
if(len(g_32) > 0):
multi_tensor_applier(self.multi_tensor_novograd,
self._dummy_overflow_buf,
[g_32, p_32, m_32],
group['exp_avg_sq'][1],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.moment_mode,
group['norm_type'])
return loss
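# Illustrative note (not part of this diff) on the per-layer norm initialization above:
# with init_zero=False and norm_type=2, group['exp_avg_sq'] starts out holding one L2
# norm per gradient tensor, roughly equivalent to
#
#     torch.cuda.FloatTensor([g.float().norm(p=2).item() for g in g_32])
#
# so the first moving-average blend leaves the stored norm unchanged.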
import torch
from torch.optim.optimizer import Optimizer, required
from apex.multi_tensor_apply import multi_tensor_applier
class FusedSGD(Optimizer):
r"""Implements stochastic gradient descent (optionally with momentum).
Nesterov momentum is based on the formula from
`On the importance of initialization and momentum in deep learning`__.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float): learning rate
momentum (float, optional): momentum factor (default: 0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
dampening (float, optional): dampening for momentum (default: 0)
nesterov (bool, optional): enables Nesterov momentum (default: False)
Example:
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
>>> optimizer.zero_grad()
>>> loss_fn(model(input), target).backward()
>>> optimizer.step()
__ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
.. note::
The implementation of SGD with Momentum/Nesterov subtly differs from
Sutskever et. al. and implementations in some other frameworks.
Considering the specific case of Momentum, the update can be written as
.. math::
v = \rho * v + g \\
p = p - lr * v
where p, g, v and :math:`\rho` denote the parameters, gradient,
velocity, and momentum respectively.
This is in contrast to Sutskever et. al. and
other frameworks which employ an update of the form
.. math::
v = \rho * v + lr * g \\
p = p - v
The Nesterov version is analogously modified.
"""
def __init__(self, params, lr=required, momentum=0, dampening=0,
weight_decay=0, nesterov=False,
wd_after_momentum=False,
materialize_master_grads=True):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
if weight_decay < 0.0:
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
weight_decay=weight_decay, nesterov=nesterov)
if nesterov and (momentum <= 0 or dampening != 0):
raise ValueError("Nesterov momentum requires a momentum and zero dampening")
super(FusedSGD, self).__init__(params, defaults)
self.wd_after_momentum = wd_after_momentum
self.materialize_master_grads = materialize_master_grads
self.most_recent_scale = 1.0
self.scale_set_by_backward = False
if multi_tensor_applier.available:
import amp_C
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_sgd = amp_C.multi_tensor_sgd
else:
raise RuntimeError('apex.optimizers.FusedSGD requires cuda extensions')
def __setstate__(self, state):
super(FusedSGD, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('nesterov', False)
def get_momentums(self, params):
momentums = []
first_run = True
for p in params:
param_state = self.state[p]
# torch.optim.SGD initializes momentum in the main loop, we have
# to do it here, and track whether or not we've done so, so that
# momentum application can be skipped in the main kernel.
if 'momentum_buffer' not in param_state:
first_run = True
buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
momentums.append(buf)
else:
first_run = False
momentums.append(param_state['momentum_buffer'])
return momentums, first_run
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
explicit_master_params = (hasattr(self, "_amp_stash") and
hasattr(self._amp_stash, "fp32_from_fp16_groups"))
for gid, group in enumerate(self.param_groups):
weight_decay = group['weight_decay']
momentum = group['momentum']
dampening = group['dampening']
nesterov = group['nesterov']
# For each group, there are 3 possible combinations we need to consider:
# grad_type, param_to_update_type, momentum_type, requires_fp16_model_copy
# 1. fp16, fp16, fp16, No
# 2. fp32, fp32, fp32, No
# 3. fp16, fp32, fp32, Yes
first_runs = [True, True]
# I think a bit of code divergence in exchange for naming clarity is worthwhile
if explicit_master_params:
stash = self._amp_stash
fp32_params = [p for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
fp32_grads = [p.grad for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)
if self.materialize_master_grads:
fp16_model_params = [p for i, p in enumerate(
stash.fp16_groups[gid]) if stash.fp32_from_fp16_groups[gid][i].grad is not None]
fp32_from_fp16_grads = [p.grad for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None]
fp32_from_fp16_params = [p for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None]
fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(fp32_from_fp16_params)
fp16_set = [fp32_from_fp16_grads, fp32_from_fp16_params,
fp32_from_fp16_momentums, fp16_model_params]
else:
fp16_model_params = [p for p in stash.fp16_groups[gid] if p.grad is not None]
fp16_model_grads = [p.grad for p in stash.fp16_groups[gid] if p.grad is not None]
fp32_from_fp16_params = [p for i, p in enumerate(
stash.fp32_from_fp16_groups[gid]) if stash.fp16_groups[gid][i].grad is not None]
fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(fp32_from_fp16_params)
fp16_set = [fp16_model_grads, fp32_from_fp16_params,
fp32_from_fp16_momentums, fp16_model_params]
launch_sets= [fp16_set, [fp32_grads, fp32_params, fp32_momentums]]
else:
fp16_params = [p for p in group['params'] if (p.dtype == torch.float16 and p.grad is not None)]
fp16_grads = [p.grad for p in group['params'] if (p.dtype == torch.float16 and p.grad is not None)]
fp16_momentums, first_runs[0] = self.get_momentums(fp16_params)
fp32_params = [p for p in group['params'] if (p.dtype == torch.float32 and p.grad is not None)]
fp32_grads = [p.grad for p in group['params'] if (p.dtype == torch.float32 and p.grad is not None)]
fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)
launch_sets = [[fp16_grads, fp16_params, fp16_momentums],
[fp32_grads, fp32_params, fp32_momentums]]
for s, (launch_set, first_run) in enumerate(zip(launch_sets, first_runs)):
assert len(launch_set[0]) == len(launch_set[1])
assert len(launch_set[0]) == len(launch_set[2])
if len(launch_set[0]) > 0:
multi_tensor_applier(
self.multi_tensor_sgd,
self._dummy_overflow_buf,
launch_set,
weight_decay,
momentum,
dampening,
group['lr'],
nesterov,
first_run,
self.wd_after_momentum,
1.0/self.most_recent_scale)
self.most_recent_scale = 1.0
self.scale_set_by_backward = False
return loss
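# Usage sketch (illustrative, not part of this diff): FusedSGD as a drop-in for
# torch.optim.SGD under apex.amp; the _amp_stash branch above then picks up the fp32
# master params automatically. The model, loss, and opt_level below are assumptions.
#
#     from apex import amp
#     from apex.optimizers import FusedSGD
#
#     optimizer = FusedSGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
#     model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
#     with amp.scale_loss(loss, optimizer) as scaled_loss:
#         scaled_loss.backward()
#     optimizer.step()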
@@ -44,7 +44,7 @@ def apply_flat_dist_call(bucket, call, extra_args=None):
if call is dist.all_reduce:
coalesced /= dist.get_world_size()
for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
buf.copy_(synced)
@@ -54,7 +54,7 @@ def split_half_float_double(tensors):
for i, dtype in enumerate(dtypes):
bucket = [t for t in tensors if t.type() == dtype]
if bucket:
buckets.append(bucket)
return buckets
def split_by_type(tensors):
@@ -69,12 +69,12 @@ def split_by_type(tensors):
# flat_dist_call organizes 'tensors' by type.
def flat_dist_call(tensors, call, extra_args=None):
buckets = split_by_type(tensors)
for tp in buckets:
bucket = buckets[tp]
apply_flat_dist_call(bucket, call, extra_args)
def extract_tensors(maybe_tensor, tensor_list):
if torch.is_tensor(maybe_tensor):
tensor_list.append(maybe_tensor)
@@ -85,7 +85,7 @@ def extract_tensors(maybe_tensor, tensor_list):
except TypeError:
return
class Reducer(object):
"""
:class:`apex.parallel.Reducer` is a simple class that helps allreduce a module's parameters
@@ -93,13 +93,13 @@ class Reducer(object):
Unlike :class:`DistributedDataParallel`, :class:`Reducer` will not automatically allreduce
parameters during ``backward()``.
Instead, :class:`Reducer` waits for the user to call ``<reducer_instance>.reduce()`` manually.
This enables, for example, delaying the allreduce to be carried out every
several iterations instead of every single iteration.
Like :class:`DistributedDataParallel`, :class:`Reducer` averages any tensors it allreduces
over the number of participating processes.
:class:`Reducer` is designed to work with the upstream launch utility script
``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``.
When used with this launcher, :class:`Reducer` assumes 1:1 mapping of processes to GPUs.
It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
@@ -107,7 +107,7 @@ class Reducer(object):
Args:
module_or_grads_list: Either a network definition (module) being run in multi-gpu/distributed mode, or an iterable of gradients to be reduced. If a module is passed in, the Reducer constructor will sync the parameters across processes (broadcasting from rank 0) to make sure they're all initialized with the same values. If a list of gradients (that came from some module) is passed in, the user is responsible for manually syncing that module's parameters at the beginning of training.
"""
def __init__(self, module_or_grads_list):
if isinstance(module_or_grads_list, Module):
self.module = module_or_grads_list
@@ -117,26 +117,26 @@ class Reducer(object):
self.module = None
self.grads = []
extract_tensors(module_or_grads_list, self.grads)
def reduce(self):
if self.module:
grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
flat_dist_call(grads, dist.all_reduce)
else:
flat_dist_call(self.grads, dist.all_reduce)
class DistributedDataParallel(Module):
"""
:class:`apex.parallel.DistributedDataParallel` is a module wrapper that enables
easy multiprocess distributed data parallel training, similar to ``torch.nn.parallel.DistributedDataParallel``. Parameters are broadcast across participating processes on initialization, and gradients are
allreduced and averaged over processes during ``backward()``.
:class:`DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by
:class:`DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by
overlapping communication with computation during ``backward()`` and bucketing smaller gradient
transfers to reduce the total number of transfers required.
:class:`DistributedDataParallel` is designed to work with the upstream launch utility script
:class:`DistributedDataParallel` is designed to work with the upstream launch utility script
``torch.distributed.launch`` with ``--nproc_per_node <= number of gpus per node``.
When used with this launcher, :class:`DistributedDataParallel` assumes 1:1 mapping of processes to GPUs.
It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
@@ -159,19 +159,23 @@ class DistributedDataParallel(Module):
"""
def __init__(self,
module,
message_size=10000000,
delay_allreduce=False,
shared_param=None,
allreduce_trigger_params=None,
retain_allreduce_buffers=False,
allreduce_always_fp32=False,
num_allreduce_streams=1,
allreduce_communicators=None,
gradient_average=True,
gradient_predivide_factor=1.0,
gradient_average_split_factor=None,
prof=False):
super(DistributedDataParallel, self).__init__()
# Backward/forward compatibility around
# https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
# https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
if hasattr(dist, "get_backend"):
@@ -181,13 +185,26 @@ class DistributedDataParallel(Module):
else:
self.backend_enum_holder = dist.Backend
else:
self._backend = dist._backend
self.backend_enum_holder = dist.dist_backend
self.warn_on_half = True if self._backend == self.backend_enum_holder.GLOO else False
self.prof = prof
self.allreduce_different_streams = (num_allreduce_streams > 1)
self.num_allreduce_streams = num_allreduce_streams
self.allreduce_communicators = allreduce_communicators
if self.allreduce_communicators:
assert len(allreduce_communicators[0]) == num_allreduce_streams
assert len(allreduce_communicators[0]) == len(allreduce_communicators[1])
assert self.allreduce_different_streams
if self.allreduce_different_streams and delay_allreduce:
raise ValueError("self.allreduce_different_streams may only be used if delay_allreduce=False.")
if shared_param is not None:
raise ValueError("shared_param is no longer supported as an option. It was misleadingly named from the start. It turns out overlapping communication with computation should work fine with shared parameters. If you still wish to delay communication to the end of the backward pass, use delay_allreduce=True|False instead.")
raise ValueError("shared_param is no longer supported as an option. It was misleadingly named from the start. It turns out overlapping communication with computation should work fine with shared parameters. If you still wish to delay communication to the end of the backward pass, use delay_allreduce=True|False instead.")
self.world_size = float(dist.get_world_size())
@@ -199,27 +216,29 @@ class DistributedDataParallel(Module):
self.custom_allreduce_triggers = False
if allreduce_trigger_params is not None:
if delay_allreduce:
raise ValueError("Setting allreduce_trigger_params is only valid if delay_allreduce=False.")
raise ValueError("Setting allreduce_trigger_params is only valid if delay_allreduce=False.")
self.custom_allreduce_triggers = True
self.allreduce_trigger_params = set([id(param) for param in allreduce_trigger_params])
self.delay_allreduce = delay_allreduce
self.message_size = message_size
self.reduction_stream = torch.cuda.Stream()
self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False)
self.main_stream = torch.cuda.current_stream()
self.bucket_streams = []
self.bucket_events = []
self.module = module
self._disable_allreduce = False
if self._backend == self.backend_enum_holder.NCCL:
for param in self.module.parameters():
assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."
self.active_params = []
self.param_type_to_tmp_i = {"torch.cuda.HalfTensor" : 0,
"torch.cuda.FloatTensor" : 1,
"torch.cuda.DoubleTensor" : 2}
@@ -236,15 +255,21 @@ class DistributedDataParallel(Module):
def __setstate__(self, state):
super(DistributedDataParallel, self).__setstate__(state)
self.reduction_stream = torch.cuda.Stream()
self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False)
if self.allreduce_different_streams and delay_allreduce:
raise ValueError("self.allreduce_different_streams may only be used if delay_allreduce=False.")
if self.delay_allreduce:
self.needs_refresh = True
self.bucket_streams = []
self.bucket_events = []
def __getstate__(self):
attrs = copy.copy(self.__dict__)
if self._backend != self.backend_enum_holder.NCCL:
del attrs['reduction_stream']
del attrs['reduction_event']
del attrs['bucket_streams']
del attrs['bucket_events']
return attrs
def enable_allreduce(self):
@@ -252,9 +277,9 @@ class DistributedDataParallel(Module):
def disable_allreduce(self):
self._disable_allreduce = True
# Broadcast rank 0's bucket structure across all processes, and have all processes
# regenerate their bucket structures to match.
def sync_bucket_structure(self):
# Append leftover buckets
for tmp_bucket in self.tmp_buckets:
@@ -264,8 +289,8 @@ class DistributedDataParallel(Module):
self.num_buckets = len(self.active_i_buckets)
self.bucket_sizes = [len(bucket) for bucket in self.active_i_buckets]
info_tensor = torch.cuda.IntTensor([self.num_buckets] +
self.bucket_sizes +
list(chain(*self.active_i_buckets)))
dist.broadcast(info_tensor, 0)
@@ -273,27 +298,27 @@ class DistributedDataParallel(Module):
info = [int(entry) for entry in info_tensor]
self.num_buckets = info[0]
self.bucket_sizes = info[1:self.num_buckets + 1]
self.buckets = [[None for _ in range(self.bucket_sizes[i])]
for i in range(self.num_buckets)]
# Technically, active_i_buckets' work is done. But the information is still useful to
# keep around. Therefore, refresh active_i_buckets based on rank 0 as well.
self.active_i_buckets = [[None for _ in range(self.bucket_sizes[i])]
for i in range(self.num_buckets)]
flattened_buckets = info[self.num_buckets + 1:]
flat_i = 0
for bucket_idx in range(self.num_buckets):
for bucket_loc in range(self.bucket_sizes[bucket_idx]):
param_i = flattened_buckets[flat_i]
self.active_i_buckets[bucket_idx][bucket_loc] = param_i
self.param_id_to_bucket[id(self.active_params[param_i])] = (bucket_idx, bucket_loc)
flat_i += 1
def create_hooks(self):
# Fallback hook that's only called at the end of backward.
# Used if you deliberately want to delay allreduces to the end, or to refresh the
# bucket structure that will be used to overlap communication with computation in later
# iterations.
def allreduce_params():
@@ -308,9 +333,10 @@ class DistributedDataParallel(Module):
def overlapping_backward_epilogue():
self.reduction_stream.record_event(self.reduction_event)
torch.cuda.current_stream().wait_event(self.reduction_event)
for stream, event in zip(self.bucket_streams, self.bucket_events):
stream.record_event(event)
torch.cuda.current_stream().wait_event(event)
# Sanity checks that all the buckets were kicked off
if self.next_bucket != self.num_buckets:
raise RuntimeError("In epilogue, next_bucket ({}) != num_buckets ({}). ".format(
@@ -320,7 +346,7 @@ class DistributedDataParallel(Module):
for actual, expected in zip(self.buckets_ready_size, self.bucket_sizes):
if actual != expected:
raise RuntimeError("Some param buckets were not allreduced.")
self.grad_accs = []
for param in self.module.parameters():
@@ -330,6 +356,9 @@ class DistributedDataParallel(Module):
grad_acc = param_tmp.grad_fn.next_functions[0][0]
def allreduce_hook(*unused):
if self.prof:
torch.cuda.nvtx.range_push("allreduce_hook")
if not self._disable_allreduce:
if self.delay_allreduce or self.needs_refresh:
# TODO: How do we want to handle multiple backward passes between
@@ -341,8 +370,8 @@ class DistributedDataParallel(Module):
# Float, half, and double tensors are grouped into buckets separately.
current_type = self.param_type_to_tmp_i[param.type()]
self.tmp_buckets[current_type].append(active_i)
ship_tmp_bucket = False
if self.custom_allreduce_triggers:
@@ -359,82 +388,133 @@ class DistributedDataParallel(Module):
self.active_i_buckets.append(self.tmp_buckets[current_type])
self.tmp_buckets[current_type] = []
self.tmp_numels[current_type] = 0
if not self.callback_queued:
Variable._execution_engine.queue_callback(allreduce_params)
self.callback_queued = True
else:
if not self.callback_queued:
Variable._execution_engine.queue_callback(overlapping_backward_epilogue)
self.callback_queued = True
self.comm_ready_buckets(param)
if self.prof:
torch.cuda.nvtx.range_pop()
grad_acc.register_hook(allreduce_hook)
self.grad_accs.append(grad_acc)
wrapper(param)
def allreduce_bucket(self, bucket):
def _stream_this_bucket(self, bucket_idx):
if self.allreduce_different_streams:
return self.bucket_streams[bucket_idx%self.num_allreduce_streams]
else:
return self.bucket_streams[0]
def _event_this_bucket(self, bucket_idx):
if self.allreduce_different_streams:
return self.bucket_events[bucket_idx%self.num_allreduce_streams]
else:
return self.bucket_events[0]
def allreduce_bucket(self, bucket, bucket_idx, force_default_stream):
tensor = flatten(bucket)
if force_default_stream:
bucket_stream = self.main_stream
else:
bucket_stream = self._stream_this_bucket(bucket_idx)
bucket_event = self._event_this_bucket(bucket_idx)
torch.cuda.current_stream().record_event(bucket_event)
bucket_stream.wait_event(bucket_event)
with torch.cuda.stream(bucket_stream):
# self.main_stream.wait_stream(torch.cuda.current_stream())
# torch.cuda.synchronize()
tensor_to_allreduce = tensor
if self.allreduce_always_fp32:
tensor_to_allreduce = tensor.float()
if self.gradient_predivide_factor != 1.0:
tensor_to_allreduce.mul_(1./self.gradient_predivide_factor)
if self.allreduce_different_streams and not force_default_stream:
dist.all_reduce(tensor_to_allreduce, group=self.bucket_pgs[bucket_idx%self.num_allreduce_streams])
else:
dist.all_reduce(tensor_to_allreduce)
if self.gradient_predivide_factor != self.world_size:
if self.gradient_average:
tensor_to_allreduce.mul_(self.gradient_predivide_factor/self.world_size)
if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce:
tensor.copy_(tensor_to_allreduce)
if not self.retain_allreduce_buffers:
if multi_tensor_applier.available:
multi_tensor_applier(
self.multi_tensor_scale,
self._overflow_buf,
[unflatten(tensor, bucket), bucket],
1.0)
else:
for buf, synced in zip(bucket, unflatten(tensor, bucket)):
buf.copy_(synced)
# I think we actually do need this here. After allreduce_bucket returns, tensor will
# eventually go out of scope and die, at which point it could otherwise be freed for
# further reuse by the main stream while the allreduce/div/unflatten are underway in bucket_stream.
tensor.record_stream(bucket_stream)
return tensor
def allreduce_maybe_retain(self, bucket, bucket_idx=-1):
allreduced = self.allreduce_bucket(bucket)
def allreduce_maybe_retain(self, bucket, bucket_idx, force_default_stream=False):
allreduced = self.allreduce_bucket(bucket, bucket_idx, force_default_stream)
if self.retain_allreduce_buffers:
if self.allreduce_buffers[bucket_idx] is not None:
raise RuntimeError("The backward pass is attempting to replace an already-filled "
"allreduce buffer. This is almost certainly an error.")
self.allreduce_buffers[bucket_idx] = allreduced
else:
if multi_tensor_applier.available:
multi_tensor_applier(
self.multi_tensor_scale,
self._overflow_buf,
[unflatten(allreduced, bucket), bucket],
1.0)
else:
for buf, synced in zip(bucket, unflatten(allreduced, bucket)):
buf.copy_(synced)
for view, grad in zip(unflatten(allreduced, bucket), bucket):
grad.data = view
# for buf, synced in zip(bucket, unflatten(allreduced, bucket)):
# buf.copy_(synced)
def allreduce_fallback(self):
grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
for stream, event in zip(self.bucket_streams, self.bucket_events):
stream.record_event(event)
torch.cuda.current_stream().wait_event(event)
if self.retain_allreduce_buffers:
grads = [param.grad for param in self.module.parameters() if param.grad is not None]
else:
grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
split_buckets = split_half_float_double(grads)
# If retain_allreduce_buffers is True and delay_allreduce is False,
# this will only be done during the first backward pass, ignored by the
# training script, and overwritten in the next forward pass. So it's harmless.
if self.retain_allreduce_buffers:
self.allreduce_buffers = [None for _ in range(len(split_buckets))]
for i, bucket in enumerate(split_buckets):
allreduced = self.allreduce_maybe_retain(bucket, i)
allreduced = self.allreduce_maybe_retain(bucket, i, force_default_stream=True)
def comm_ready_buckets(self, param):
# Need to do this in every hook for compatibility with Ruberry's streaming backward PR.
# self.reduction_stream.wait_stream(torch.cuda.current_stream())
if self.prof:
torch.cuda.nvtx.range_push("comm_ready_buckets")
bucket_idx, bucket_loc = self.param_id_to_bucket[id(param)]
@@ -442,39 +522,46 @@ class DistributedDataParallel(Module):
raise RuntimeError("The backward pass is attempting to replace an already-filled "
"bucket slot. This is almost certainly an error.")
self.buckets[bucket_idx][bucket_loc] = param.grad.data
if self.retain_allreduce_buffers:
self.buckets[bucket_idx][bucket_loc] = param.grad
else:
self.buckets[bucket_idx][bucket_loc] = param.grad.data
self.buckets_ready_size[bucket_idx] += 1
if self.buckets_ready_size[bucket_idx] == self.bucket_sizes[bucket_idx]:
if bucket_idx == self.next_bucket:
torch.cuda.current_stream().record_event(self.reduction_event)
self.reduction_stream.wait_event(self.reduction_event)
with torch.cuda.stream(self.reduction_stream):
self.allreduce_maybe_retain(self.buckets[bucket_idx], bucket_idx)
self.next_bucket += 1
# Reversing upstream's logic here, because we constructed our buckets based on
# the order things were received during backward.
if len(self.ready_buckets_not_reduced) > 0:
sorted_todo = sorted(self.ready_buckets_not_reduced)
for i in sorted_todo:
# Nothing can be reduced now
if i > self.next_bucket:
break
elif i == self.next_bucket:
self.allreduce_maybe_retain(self.buckets[i], i)
self.ready_buckets_not_reduced.remove(i)
self.next_bucket += 1
else:
raise ValueError("i should always be >= next_bucket")
else:
self.ready_buckets_not_reduced.add(bucket_idx)
if self.prof:
torch.cuda.nvtx.range_pop()
def forward(self, *inputs, **kwargs):
result = self.module(*inputs, **kwargs)
if self.prof:
torch.cuda.nvtx.range_push("forward pass DDP logic")
if not self._disable_allreduce:
if not self.delay_allreduce:
param_list = [param for param in self.module.parameters() if param.requires_grad]
@@ -483,7 +570,7 @@ class DistributedDataParallel(Module):
# Forward has the authority to set needs_refresh to True, but only allreduce_params
# in backward has the authority to set needs_refresh to False.
# Parentheses are not necessary for correct order of operations, but make the intent clearer.
if ((not self.active_params) or
(len(param_list) != len(self.active_params)) or
any([param1 is not param2 for param1, param2 in zip(param_list, self.active_params)])):
self.needs_refresh = True
@@ -494,19 +581,59 @@ class DistributedDataParallel(Module):
self.tmp_buckets = [[], [], []] # [running half, float, double buckets]
self.tmp_numels = [0, 0, 0]
self.bucket_sizes = []
self.param_id_to_active_i = {id(param) : i for i, param in enumerate(param_list)}
self.param_id_to_bucket = {}
self.bucket_pgs = []
self.bucket_streams = []
self.bucket_events = []
else:
# self.buckets = [[None for _ in range(self.bucket_sizes[i])]
# for i in range(self.num_buckets)]
if not self.buckets:
self.buckets = [[None for _ in range(self.bucket_sizes[i])]
for i in range(self.num_buckets)]
else:
assert len(self.buckets) == self.num_buckets, "len(buckets) = {}, expected {}".format(
len(self.buckets), self.num_buckets)
for b, bucket in enumerate(self.buckets):
assert len(bucket) == self.bucket_sizes[b], "len(buckets[{}]) = {}, expected {}".format(
b, len(bucket), self.bucket_sizes[b])
for i in range(len(bucket)):
bucket[i] = None
if self.allreduce_communicators:
self.bucket_pgs = self.allreduce_communicators[0]
self.bucket_streams = self.allreduce_communicators[1]
self.bucket_events = [torch.cuda.Event(enable_timing=False,
blocking=False) for _ in range(self.num_allreduce_streams)]
else:
if self.allreduce_different_streams:
if not self.bucket_pgs:
self.bucket_pgs = [dist.new_group() for _ in range(self.num_allreduce_streams)]
for i, bg in enumerate(self.bucket_pgs):
print("rank {} created group {} with backend {}".format(
dist.get_rank(), i, dist.get_backend(bg)))
if self.allreduce_different_streams:
if not self.bucket_streams:
self.bucket_streams = [torch.cuda.Stream() for _ in range(self.num_allreduce_streams)]
self.bucket_events = [torch.cuda.Event(enable_timing=False,
blocking=False) for _ in range(self.num_allreduce_streams)]
else:
if not self.bucket_streams:
self.bucket_streams = [torch.cuda.Stream()]
self.bucket_events = [torch.cuda.Event(enable_timing=False, blocking=False)]
self.buckets_ready_size = [0 for i in range(self.num_buckets)]
if(self.retain_allreduce_buffers):
self.allreduce_buffers = [None for _ in range(self.num_buckets)]
self.next_bucket = 0
self.ready_buckets_not_reduced = set()
self.active_params = param_list
self.callback_queued = False
if self.prof:
torch.cuda.nvtx.range_pop()
return result
@@ -55,10 +55,11 @@ class SyncBatchNorm(_BatchNorm):
>>> inp = torch.randn(10, 14, 14, 100).cuda()
"""
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, process_group=None, channel_last=False):
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, process_group=None, channel_last=False, fuse_relu=False):
super(SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats)
self.process_group = process_group
self.channel_last = channel_last
self.fuse_relu = fuse_relu
def _specify_process_group(self, process_group):
self.process_group = process_group
@@ -66,11 +67,11 @@ class SyncBatchNorm(_BatchNorm):
def _specify_channel_last(self, channel_last):
self.channel_last = channel_last
def forward(self, input):
def forward(self, input, z = None):
# if input.dim() == 2, we switch to channel_last for efficient memory accessing
channel_last = self.channel_last if input.dim() != 2 else True
if not self.training and self.track_running_stats and not channel_last:
if not self.training and self.track_running_stats and not self.channel_last and not self.fuse_relu and z is None:
# fall back to pytorch implementation for inference
return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
else:
@@ -81,4 +82,4 @@ class SyncBatchNorm(_BatchNorm):
exponential_average_factor = 1.0 / float(self.num_batches_tracked)
else:
exponential_average_factor = self.momentum
return SyncBatchnormFunction.apply(input, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last)
return SyncBatchnormFunction.apply(input, z, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, self.channel_last, self.fuse_relu)
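# Usage sketch (illustrative, not part of this diff): the new fuse_relu path lets the
# layer take an optional residual input z and apply the ReLU inside the fused kernel.
# The channel count and channels-last layout below are assumptions for the example.
#
#     bn = apex.parallel.SyncBatchNorm(64, channel_last=True, fuse_relu=True).cuda()
#     out = bn(x, z=residual)   # roughly: relu(batchnorm(x) + residual)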
@@ -7,7 +7,7 @@ from apex.parallel import ReduceOp
class SyncBatchnormFunction(Function):
@staticmethod
def forward(ctx, input, weight, bias, running_mean, running_variance, eps, track_running_stats = True, momentum = 1.0, process_group = None, channel_last = False):
def forward(ctx, input, z, weight, bias, running_mean, running_variance, eps, track_running_stats = True, momentum = 1.0, process_group = None, channel_last = False, fuse_relu = False):
torch.cuda.nvtx.range_push("sync_BN_fw")
input = input.contiguous()
world_size = 0
@@ -53,13 +53,14 @@ class SyncBatchnormFunction(Function):
mean = running_mean.data
inv_std = 1.0 / torch.sqrt(running_variance.data + eps)
ctx.save_for_backward(input, weight, mean, inv_std)
ctx.save_for_backward(input, weight, mean, inv_std, z, bias)
ctx.process_group = process_group
ctx.channel_last = channel_last
ctx.world_size = world_size
ctx.fuse_relu = fuse_relu
if channel_last:
out = syncbn.batchnorm_forward_c_last(input, mean, inv_std, weight, bias)
out = syncbn.batchnorm_forward_c_last(input, z, mean, inv_std, weight, bias, fuse_relu)
else:
out = syncbn.batchnorm_forward(input, mean, inv_std, weight, bias)
@@ -73,11 +74,17 @@ class SyncBatchnormFunction(Function):
# mini batch mean & var are calculated by forward path.
# mu = 1./N*np.sum(h, axis = 0)
# var = 1./N*np.sum((h-mu)**2, axis = 0)
saved_input, weight, mean, inv_std = ctx.saved_tensors
saved_input, weight, mean, inv_std, z, bias = ctx.saved_tensors
process_group = ctx.process_group
channel_last = ctx.channel_last
world_size = ctx.world_size
grad_input = grad_weight = grad_bias = None
fuse_relu = ctx.fuse_relu
grad_input = grad_z = grad_weight = grad_bias = None
if fuse_relu:
grad_output = syncbn.relu_bw_c_last(grad_output, saved_input, z, mean, inv_std, weight, bias)
if isinstance(z, torch.Tensor) and ctx.needs_input_grad[1]:
grad_z = grad_output.clone()
# TODO(jie): why do I have to clone here? lifetime of grad_output?
if channel_last:
@@ -100,11 +107,11 @@ class SyncBatchnormFunction(Function):
else:
grad_input = syncbn.batchnorm_backward(grad_output, saved_input, mean, inv_std, weight, mean_dy, mean_dy_xmu)
if weight is None or not ctx.needs_input_grad[1]:
if weight is None or not ctx.needs_input_grad[2]:
grad_weight = None
if weight is None or not ctx.needs_input_grad[2]:
if weight is None or not ctx.needs_input_grad[3]:
grad_bias = None
torch.cuda.nvtx.range_pop()
return grad_input, grad_weight, grad_bias, None, None, None, None, None, None, None
return grad_input, grad_z, grad_weight, grad_bias, None, None, None, None, None, None, None, None
@@ -6,6 +6,19 @@ void multi_tensor_scale_cuda(
std::vector<std::vector<at::Tensor>> tensor_lists,
float scale);
void multi_tensor_sgd_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
float wd,
float momentum,
float dampening,
float lr,
bool nesterov,
bool first_run,
bool wd_after_momentum,
float scale);
void multi_tensor_axpby_cuda(
int chunk_size,
at::Tensor noop_flag,
@@ -21,28 +34,74 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
at::optional<bool> per_tensor_python);
void multi_tensor_lamb_stage1_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor per_tensor_decay,
const int step,
const float beta1,
const float beta2,
const float epsilon,
const float global_grad_norm,
const float max_global_grad_norm);
void multi_tensor_lamb_stage2_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor per_tensor_param_norm,
at::Tensor per_tensor_update_norm,
const float step_size);
void multi_tensor_adam_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor per_tensor_decay,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay);
void multi_tensor_novograd_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor grad_norms,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const float global_grad_norm,
const float max_global_grad_norm);
const int step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int mode,
const int norm_type);
void multi_tensor_lamb_stage2_cuda(
void multi_tensor_lamb_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor per_tensor_param_norm,
at::Tensor per_tensor_update_norm,
const float step_size);
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int mode,
const float max_grad_norm);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("multi_tensor_scale", &multi_tensor_scale_cuda,
"Fused overflow check + scale for a list of contiguous tensors");
m.def("multi_tensor_sgd", &multi_tensor_sgd_cuda,
"Fused SGD optimizer for list of contiguous tensors");
m.def("multi_tensor_axpby", &multi_tensor_axpby_cuda,
"out = a*x + b*y for a list of contiguous tensors");
m.def("multi_tensor_l2norm", &multi_tensor_l2norm_cuda,
@@ -51,4 +110,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"Computes update part of LAMB optimizer");
m.def("multi_tensor_lamb_stage2_cuda", &multi_tensor_lamb_stage2_cuda,
"Completes application of gradient to parameters for LAMB optimizer");
m.def("multi_tensor_adam", &multi_tensor_adam_cuda,
"Compute and apply gradient update to parameters for Adam optimizer");
m.def("multi_tensor_novograd", &multi_tensor_novograd_cuda,
"Compute and apply gradient update to parameters for Adam optimizer");
m.def("multi_tensor_lamb", &multi_tensor_lamb_cuda,
"Computes and apply update for LAMB optimizer");
}
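For reference, a minimal Python sketch of driving the multi_tensor_adam binding declared above. It assumes apex was built with --cpp_ext --cuda_ext so amp_C is importable; the chunk size, tensor shapes, and hyperparameter values are illustrative, and the tensor-list order (g, p, m, v) follows the Adam kernel further down in this diff.
import torch
import amp_C
chunk_size = 2048 * 32                                       # chunking granularity used by multi_tensor_apply
noop_flag = torch.zeros(1, dtype=torch.int, device='cuda')   # shared no-op/overflow flag
params     = [torch.randn(1024, device='cuda') for _ in range(3)]
grads      = [torch.randn_like(p) for p in params]
exp_avg    = [torch.zeros_like(p) for p in params]
exp_avg_sq = [torch.zeros_like(p) for p in params]
amp_C.multi_tensor_adam(
    chunk_size, noop_flag,
    [grads, params, exp_avg, exp_avg_sq],   # tensor-list order: g, p, m, v
    1e-3,    # lr
    0.9,     # beta1
    0.999,   # beta2
    1e-8,    # epsilon
    1,       # step
    1,       # mode: 1 = decoupled weight decay (AdamW), 0 = L2
    1,       # bias_correction
    1e-2)    # weight_decay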
......@@ -3,6 +3,9 @@
// CUDA forward declaration
void fused_adam_cuda(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay);
void fused_adam_cuda_mt(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay);
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
......@@ -25,4 +28,5 @@ void adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, a
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("adam", &adam, "Adam optimized CUDA implementation.");
m.def("adam_mt", &fused_adam_cuda_mt, "Multi tensor Adam optimized CUDA implementation.");
}
......@@ -9,6 +9,10 @@
// #include "ATen/Type.h"
#include "ATen/AccumulateType.h"
#include <THC/THCGeneral.h>
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
#include "type_shim.h"
......@@ -55,6 +59,93 @@ __global__ void adam_cuda_kernel(
}
}
template <int DEPTH, typename T, typename GRAD_T>
struct AdamFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<DEPTH>& tl,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
adamMode_t mode,
const float decay)
{
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
T* p = (T *)tl.addresses[0][tensor_loc];
p += chunk_idx*chunk_size;
T* m = (T *)tl.addresses[1][tensor_loc];
m += chunk_idx*chunk_size;
T* v = (T *)tl.addresses[2][tensor_loc];
v += chunk_idx*chunk_size;
GRAD_T* g = (GRAD_T *)tl.addresses[3][tensor_loc];
g += chunk_idx*chunk_size;
GRAD_T* p_copy = NULL;
if (DEPTH == 5) {
p_copy = (GRAD_T *)tl.addresses[4][tensor_loc];
p_copy += chunk_idx*chunk_size;
}
n -= chunk_idx*chunk_size;
T incoming_p[ILP];
T incoming_m[ILP];
T incoming_v[ILP];
T incoming_g[ILP];
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP) {
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
incoming_p[ii] = 0;
incoming_m[ii] = 0;
incoming_v[ii] = 0;
incoming_g[ii] = 0;
int i = i_start + threadIdx.x + ii*blockDim.x;
if (i < n && i < chunk_size) {
incoming_p[ii] = p[i];
incoming_m[ii] = m[i];
incoming_v[ii] = v[i];
incoming_g[ii] = static_cast<T>(g[i]);
}
}
// note for clarification to future michael:
// From a pure memory dependency perspective, there's likely no point unrolling
// the write loop, since writes just fire off once their LDGs arrive.
// Put another way, the STGs are dependent on the LDGs, but not on each other.
// There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int j = i_start + threadIdx.x + ii*blockDim.x;
if(j < n && j < chunk_size) {
T scaled_grad = incoming_g[ii]/grad_scale;
m[j] = b1*incoming_m[ii] + (1-b1)*scaled_grad;
v[j] = b2*incoming_v[ii] + (1-b2)*scaled_grad*scaled_grad;
float denom;
if (mode == ADAM_MODE_0)
denom = sqrtf(v[j] + eps);
else // Mode 1
denom = sqrtf(v[j]) + eps;
float update = (m[j]/denom) + (decay*incoming_p[ii]);
p[j] = incoming_p[ii] - (step_size*update);
if (DEPTH == 5) p_copy[j] = (GRAD_T) p[j];
}
}
}
}
};
void fused_adam_cuda(
at::Tensor & p,
at::Tensor & p_copy,
......@@ -96,7 +187,7 @@ void fused_adam_cuda(
AT_ASSERTM(p.scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
//dispatch is done on the gradient type
using namespace at; // prevents "toString is undefined" errors
DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel",
DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
p.data<accscalar_t>(),
......@@ -112,7 +203,7 @@ void fused_adam_cuda(
tsize,
(adamMode_t) mode,
decay);
)
);
} else {
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
......@@ -135,3 +226,110 @@ void fused_adam_cuda(
THCudaCheck(cudaGetLastError());
}
void fused_adam_cuda_mt(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists, // p, m, v, g, p_copy
float lr,
float beta1,
float beta2,
float eps,
float grad_scale,
int step,
int mode,
int bias_correction,
float decay) {
//Constants
float step_size = 0;
if (bias_correction == 1) {
const float bias_correction1 = 1 - std::pow(beta1, step);
const float bias_correction2 = 1 - std::pow(beta2, step);
step_size = lr * std::sqrt(bias_correction2)/bias_correction1;
}
else {
step_size = lr;
}
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
size_t tl_sz = tensor_lists.size();
AT_ASSERTM(tl_sz == 4 || tl_sz == 5, "expected tensor lists of size 4 or 5");
if (tensor_lists[3][0].scalar_type() == at::ScalarType::Half) {
//all other values should be fp32 for half gradients
AT_ASSERTM(tensor_lists[0][0].scalar_type() == at::ScalarType::Float, "expected parameter to be of float type");
//dispatch is done on the gradient type
if (tl_sz == 5) {
DISPATCH_FLOAT_AND_HALF(tensor_lists[3][0].scalar_type(), 0, "adam_cuda_mt_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
multi_tensor_apply<5>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<5, accscalar_t, scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
(adamMode_t) mode,
decay);
);
} else {
DISPATCH_FLOAT_AND_HALF(tensor_lists[3][0].scalar_type(), 0, "adam_cuda_mt_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<4, accscalar_t, scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
(adamMode_t) mode,
decay);
);
}
} else {
if (tl_sz == 5) {
DISPATCH_DOUBLE_AND_FLOAT(tensor_lists[3][0].scalar_type(), 0, "adam_cuda_mt_kernel",
multi_tensor_apply<5>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<5, scalar_t_0, scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
(adamMode_t) mode,
decay);
);
} else {
DISPATCH_DOUBLE_AND_FLOAT(tensor_lists[3][0].scalar_type(), 0, "adam_cuda_mt_kernel",
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<4, scalar_t_0, scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
(adamMode_t) mode,
decay);
);
}
}
THCudaCheck(cudaGetLastError());
}
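As an aside, the host code above folds both Adam bias corrections into a single step size. A small Python sketch of that arithmetic (function name is illustrative); up to the placement of epsilon, it matches dividing m and v by their individual bias corrections.
import math
def fused_adam_step_size(lr, beta1, beta2, step, bias_correction=True):
    if bias_correction:
        bc1 = 1 - beta1 ** step
        bc2 = 1 - beta2 ** step
        return lr * math.sqrt(bc2) / bc1
    return lr
print(fused_adam_step_size(1e-3, 0.9, 0.999, step=1))   # ~3.16e-4 on the first step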
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum{
ADAM_MODE_0 =0, // L2 regularization mode
ADAM_MODE_1 =1 // Decoupled weight decay mode (AdamW)
} adamMode_t;
using MATH_T = float;
template<typename T>
struct AdamFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float beta1_correction,
const float beta2_correction,
const float epsilon,
const float lr,
adamMode_t mode,
const float decay)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
// potentially used to pass in a list of scalars
// int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx*chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx*chunk_size;
T* v = (T*)tl.addresses[3][tensor_loc];
v += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
// see note in multi_tensor_scale_kernel.cu
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_g[ii] = g[i];
r_p[ii] = p[i];
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
r_v[ii] = MATH_T(0);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
if(mode == ADAM_MODE_0) { // L2
r_g[ii] = r_g[ii] + (decay * r_p[ii]);
r_m[ii] = beta1 * r_m[ii] + (1-beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1-beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = next_m_unbiased / denom;
r_p[ii] = r_p[ii] - (lr * update);
}
else { // weight decay
r_m[ii] = beta1 * r_m[ii] + (1-beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1-beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
r_p[ii] = r_p[ii] - (lr * update);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
p[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
};
void multi_tensor_adam_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay)
{
using namespace at;
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - std::pow(beta1, step);
bias_correction2 = 1 - std::pow(beta2, step);
}
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(
tensor_lists[0][0].scalar_type(), 0, "adam",
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<scalar_t_0>(),
beta1,
beta2,
bias_correction1,
bias_correction2,
epsilon,
lr,
(adamMode_t) mode,
weight_decay); )
AT_CUDA_CHECK(cudaGetLastError());
}
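A non-fused Python reference of the per-element update performed by AdamFunctor above, assuming bias correction is enabled; function and variable names are illustrative. It mirrors the two adamMode_t branches.
import math
def adam_element(p, g, m, v, lr, beta1, beta2, eps, step, decay, adam_w_mode):
    bc1 = 1 - beta1 ** step
    bc2 = 1 - beta2 ** step
    if not adam_w_mode:               # ADAM_MODE_0: fold the L2 penalty into the gradient
        g = g + decay * p
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    denom = math.sqrt(v / bc2) + eps
    update = (m / bc1) / denom
    if adam_w_mode:                   # ADAM_MODE_1: decoupled weight decay (AdamW)
        update = update + decay * p
    return p - lr * update, m, v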
......@@ -75,6 +75,69 @@ struct L2NormFunctor
}
};
// Probably better to template, but since we are not likely to support other norms, keep a separate functor
template<typename x_t>
struct MaxNormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<1>& tl,
float* output,
float* output_per_tensor,
bool per_tensor,
int max_chunks_per_tensor)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float s_vals[512];
float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
for(int i = 0; i < ILP; i++)
vals[i] = 0.f;
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
float next = static_cast<float>(x[i]);
vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
val = fmaxf(fabsf(val), fabsf(vals[i]));
float final = reduce_block_into_lanes_max_op(s_vals, val);
if(threadIdx.x == 0)
{
if(!isfinite(final))
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
output[blockIdx.x] = fmaxf(fabsf(output[blockIdx.x]), fabsf(final));
if(per_tensor)
output_per_tensor[(tl.start_tensor_this_launch + tensor_loc)*max_chunks_per_tensor + chunk_idx] = final;
}
}
};
__global__ void cleanup(
float* output,
......@@ -113,6 +176,63 @@ __global__ void cleanup(
}
}
__global__ void cleanup_v2(
float* output,
float* output_per_tensor,
float* ret,
float* ret_per_tensor,
bool per_tensor,
int max_chunks_per_tensor,
int norm_type,
float alpha,
float beta)
{
__shared__ float vals[512];
if(blockIdx.x == 0)
{
float val = 0;
if(threadIdx.x < 320)
val = output[threadIdx.x];
if (norm_type == 0) {
float final = reduce_block_into_lanes_max_op(vals, val);
if(threadIdx.x == 0)
*ret = alpha * (*ret) + beta * final;
}
else {
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(alpha * (*ret) * (*ret) + beta * final);
}
}
if(per_tensor)
{
float* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor;
if (norm_type == 0) {
float val = 0;
for(int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
val = fmaxf(fabsf(val), fabsf(output_this_tensor[i]));
float final = reduce_block_into_lanes_max_op(vals, val);
if(threadIdx.x == 0)
ret_per_tensor[blockIdx.x] = alpha * ret_per_tensor[blockIdx.x] + beta * final;
}
else {
float val = 0;
for(int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
val += output_this_tensor[i];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
ret_per_tensor[blockIdx.x] = sqrt(alpha * ret_per_tensor[blockIdx.x] * ret_per_tensor[blockIdx.x] + beta * final);
}
}
}
std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
int chunk_size,
......@@ -178,3 +298,90 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}
// Compute and update grad norm
// Here use a per tensor norm, and blend new norm(n) and old norm(gn) by
// L-2: gn = sqrt(a * gn^2 + b * n^2)
// L-inf: gn = a * gn + b * n
void multi_tensor_norm_out_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor out,
const float alpha,
const float beta,
const int norm_type)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
// we don't need the global buffer zeroed, thus use empty here
auto output = at::empty({320}, float_options);
at::Tensor output_per_tensor;
at::Tensor ret_per_tensor;
int ntensors = tensor_lists[0].size();
int max_chunks_per_tensor = -1;
for(int t = 0; t < ntensors; t++)
{
int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
if(max_chunks_this_tensor > max_chunks_per_tensor)
max_chunks_per_tensor = max_chunks_this_tensor;
}
// Although it is a single write then read, it still needs to be zeroed,
// since trailing elements also participate in the cleanup
output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, float_options);
if (norm_type == 0) {
DISPATCH_FLOAT_AND_HALF(
tensor_lists[0][0].scalar_type(), 0, "multi_tensor_maxnorm_cuda",
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
MaxNormFunctor<scalar_t_0>(),
output.data<float>(),
output_per_tensor.data<float>(),
true,
max_chunks_per_tensor);)
}
else {
DISPATCH_FLOAT_AND_HALF(
tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
L2NormFunctor<scalar_t_0>(),
output.data<float>(),
output_per_tensor.data<float>(),
true,
max_chunks_per_tensor);)
}
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
// This involves one more small kernel launch, but it will be negligible end to end.
// I could get rid of it by hacking the functor + multi tensor harness with persistence
// logic, but keeping it simple for now
auto ret = at::empty({1}, output.options());
auto stream = at::cuda::getCurrentCUDAStream();
cleanup_v2<<<ntensors, 512, 0, stream>>>(
output.data<float>(),
output_per_tensor.data<float>(),
ret.data<float>(),
out.data<float>(),
true,
max_chunks_per_tensor,
norm_type,
alpha,
beta);
return ;
}
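The blend applied by cleanup_v2 above, written out in plain Python for clarity (a sketch; argument names are illustrative). With the NovoGrad call site further down passing a = beta2 and b = 1 - beta2, the result is an exponential moving average of each tensor's gradient norm.
import math
def blend_norm(old_norm, new_norm, a, b, norm_type):
    if norm_type == 0:                                    # L-inf: gn = a*gn + b*n
        return a * old_norm + b * new_norm
    return math.sqrt(a * old_norm**2 + b * new_norm**2)   # L-2:  gn = sqrt(a*gn^2 + b*n^2)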
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum{
MOMENT_MODE_0 =0, // L2 regularization mode
MOMENT_MODE_1 =1 // Decoupled weight decay mode
} adamMode_t;
std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::optional<bool> per_tensor_python);
using MATH_T = float;
template<typename T>
struct LAMBStage1Functor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float beta3,
const float beta1_correction,
const float beta2_correction,
const float epsilon,
adamMode_t mode,
const float decay,
float* global_grad_norm,
float max_global_grad_norm)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
float clipped_global_grad_norm = (*global_grad_norm) > max_global_grad_norm ? (*global_grad_norm) / max_global_grad_norm : 1.0f;
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx*chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx*chunk_size;
T* v = (T*)tl.addresses[3][tensor_loc];
v += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
// see note in multi_tensor_scale_kernel.cu
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_g[ii] = g[i];
// special ?optimization? for lamb stage 1
if (decay == 0) {
r_p[ii] = MATH_T(0);
}
else {
r_p[ii] = p[i];
}
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
r_v[ii] = MATH_T(0);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
if (mode == MOMENT_MODE_0) {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
// L2 on scaled grad
scaled_grad = scaled_grad + decay*r_p[ii];
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = next_m_unbiased / denom;
}
else {
MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
g[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
};
// Step 2 reads in the 'update' value and the per-tensor param_norm and update_norm.
// It computes the new parameter value.
template<typename T>
struct LAMBStage2Functor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<2>& tl,
const float* per_tensor_param_norm,
const float* per_tensor_update_norm,
const float learning_rate)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
float param_norm = per_tensor_param_norm[tensor_num];
float update_norm = per_tensor_update_norm[tensor_num];
MATH_T ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
T* update = (T*)tl.addresses[0][tensor_loc];
update += chunk_idx*chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
MATH_T r_p[ILP];
MATH_T r_update[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_p[ii] = p[i];
r_update[ii] = update[i];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
p[i] = r_p[ii];
}
}
}
}
};
void multi_tensor_lamb_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int mode,
const float max_grad_norm)
{
using namespace at;
// Master weights and 32-bit momentum (potentially changing) are not handled by this,
// so we assume every tensor is of the same type
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - std::pow(beta1, step);
bias_correction2 = 1 - std::pow(beta2, step);
}
// Handle grad averaging mode
float beta3 = 1.0f;
if (grad_averaging == 1) beta3 = 1 - beta1;
std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin()+1, tensor_lists.begin()+2);
// Compute global grad norm
auto grad_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, grad_list, false);
// Compute per tensor param norm
auto param_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, param_list, true);
// We now modify grad in-place to store the update before computing its norm
// Generally this is not an issue since people modify grad in the step() method all the time
// We could also grab a list of empty tensors to avoid this, but I'd like to save space/cpu code
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
LAMBStage1Functor<scalar_t_0>(),
beta1,
beta2,
beta3, // 1-beta1 or 1, depending on averaging mode
bias_correction1,
bias_correction2,
epsilon,
(adamMode_t) mode,
weight_decay,
std::get<0>(grad_norm_tuple).data<float>(),
max_grad_norm); )
// Compute update norms
auto update_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, grad_list, true);
std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin()+2);
DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
multi_tensor_apply<2>(
BLOCK_SIZE,
chunk_size,
noop_flag,
grad_param_list,
LAMBStage2Functor<scalar_t_0>(),
std::get<1>(param_norm_tuple).data<float>(),
std::get<1>(update_norm_tuple).data<float>(),
lr); )
AT_CUDA_CHECK(cudaGetLastError());
}
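For reference, a per-tensor Python sketch of the fused LAMB step implemented above (stage 1 writes the Adam-style update into the gradient buffer, stage 2 rescales it by the trust ratio). Bias correction is assumed on; names are illustrative.
import math
def lamb_reference(p, g, m, v, lr, beta1, beta2, eps, step, decay,
                   grad_averaging, adam_w_mode, global_grad_norm, max_grad_norm):
    beta3 = (1 - beta1) if grad_averaging else 1.0
    bc1 = 1 - beta1 ** step
    bc2 = 1 - beta2 ** step
    clip = global_grad_norm / max_grad_norm if global_grad_norm > max_grad_norm else 1.0
    # Stage 1: Adam-style update, stored where the gradient used to live
    update = []
    for i in range(len(p)):
        sg = g[i] / clip
        if not adam_w_mode:                      # MOMENT_MODE_0: L2 on the scaled grad
            sg = sg + decay * p[i]
        m[i] = beta1 * m[i] + beta3 * sg
        v[i] = beta2 * v[i] + (1 - beta2) * sg * sg
        u = (m[i] / bc1) / (math.sqrt(v[i] / bc2) + eps)
        if adam_w_mode:                          # MOMENT_MODE_1: decoupled weight decay
            u = u + decay * p[i]
        update.append(u)
    # Stage 2: trust ratio = ||p|| / ||update||, applied to the learning rate
    param_norm = math.sqrt(sum(x * x for x in p))
    update_norm = math.sqrt(sum(x * x for x in update))
    ratio = lr * (param_norm / update_norm) if param_norm != 0 and update_norm != 0 else lr
    return [p[i] - ratio * update[i] for i in range(len(p))]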
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum{
MOMENT_MODE_0 =0, // Novograd paper mode, momentum calculation with denom then decay inside
MOMENT_MODE_1 =1 // Decoupled weight decay mode
} momentMode_t;
void multi_tensor_norm_out_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor out,
const float alpha,
const float beta,
const int norm_type);
using MATH_T = float;
template<typename T>
struct NovoGradFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<3>& tl,
const float beta1,
const float beta2,
const float beta3,
const float beta1_correction,
const float beta2_correction,
const float epsilon,
const float lr,
momentMode_t m_mode,
const float decay,
const float* per_tensor_grad_norm)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
float grad_norm = per_tensor_grad_norm[tensor_num];
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx*chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
// see note in multi_tensor_scale_kernel.cu
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_g[ii] = g[i];
r_p[ii] = p[i];
r_m[ii] = m[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
if (m_mode == MOMENT_MODE_0) {
MATH_T next_v_unbiased = grad_norm / beta2_correction;
MATH_T denom = next_v_unbiased + epsilon;
r_g[ii] = (r_g[ii] / denom) + (decay * r_p[ii]);
r_m[ii] = beta1 * r_m[ii] + beta3 * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
r_p[ii] = r_p[ii] - (lr * next_m_unbiased);
}
else {
r_m[ii] = beta1 * r_m[ii] + beta3 * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = grad_norm / beta2_correction;
MATH_T denom = next_v_unbiased + epsilon;
MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
r_p[ii] = r_p[ii] - (lr * update);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
p[i] = r_p[ii];
m[i] = r_m[ii];
}
}
}
}
};
void multi_tensor_novograd_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor grad_norms,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int moment_mode,
const int norm_type)
{
using namespace at;
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - std::pow(beta1, step);
bias_correction2 = std::sqrt(1 - std::pow(beta2, step));
}
// Handle grad averaging mode
float beta3 = 1;
if (grad_averaging == 1) beta3 = 1 - beta1;
std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
// Compute and update grad norm
// Here use a per tensor norm, and blend new norm(n) and old norm(gn) by
// L-2: gn = sqrt(a * gn^2 + b * n^2)
// L-inf: gn = a * gn + b * n
multi_tensor_norm_out_cuda(chunk_size, noop_flag, grad_list, grad_norms, beta2, (1.0f - beta2), norm_type);
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(
tensor_lists[0][0].scalar_type(), 0, "novograd",
multi_tensor_apply<3>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
NovoGradFunctor<scalar_t_0>(),
beta1,
beta2,
beta3, // 1-beta1 or 1, depending on averaging mode
bias_correction1,
bias_correction2,
epsilon,
lr,
(momentMode_t) moment_mode,
weight_decay,
grad_norms.data<float>()); )
AT_CUDA_CHECK(cudaGetLastError());
}
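A per-layer Python sketch of the NovoGrad step above for the L-2 norm blend (norm_type == 1), assuming bias correction is on; names are illustrative. Unlike Adam, the second moment is a single per-tensor value, and the host pre-applies the square root to bias_correction2.
import math
def novograd_layer(p, g, m, lr, beta1, beta2, eps, step, decay,
                   grad_averaging, decoupled_wd, prev_norm):
    beta3 = (1 - beta1) if grad_averaging else 1.0
    bc1 = 1 - beta1 ** step
    bc2 = math.sqrt(1 - beta2 ** step)
    # multi_tensor_norm_out_cuda: blend this layer's new grad norm into the running one
    new_norm_sq = sum(x * x for x in g)
    grad_norm = math.sqrt(beta2 * prev_norm ** 2 + (1 - beta2) * new_norm_sq)
    denom = grad_norm / bc2 + eps
    for i in range(len(p)):
        if not decoupled_wd:                 # MOMENT_MODE_0: decay folded in after the denom
            gi = g[i] / denom + decay * p[i]
            m[i] = beta1 * m[i] + beta3 * gi
            p[i] = p[i] - lr * (m[i] / bc1)
        else:                                # MOMENT_MODE_1: decoupled weight decay
            m[i] = beta1 * m[i] + beta3 * g[i]
            p[i] = p[i] - lr * ((m[i] / bc1) / denom + decay * p[i])
    return p, m, grad_norm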
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include "multi_tensor_apply.cuh"
#include "compat.h"
#include <assert.h>
#include <cuda_runtime.h>
#define BLOCK_SIZE 512
#define ILP 4
/**
* Perform fused SGD on multiple buffers
* N: number of tensors
* tl[0] : gradients
* tl[1] : weights
* tl[2] : momentum buffers
* tl[3] : fp16 weights (if appropriate)
* wd : weight_decay (scalar)
* momentum : momentum (scalar)
* dampening : momentum dampening (scalar)
* lr : learning rate (scalar)
* nesterov : enable nesterov (bool)
* first run : necessary for proper momentum handling & init
* wd_after_momentum : apply weight decay _after_ momentum instead of before
**/
template<int N, typename T_grad, typename T_weight>
struct SGDFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<N>& tl,
float wd,
float momentum,
float dampening,
float lr,
bool nesterov,
bool first_run,
bool wd_after_momentum,
float scale)
{
// Early exit if we don't need to do anything
if (*noop_gmem) return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
T_grad* grad_in = (T_grad*)tl.addresses[0][tensor_loc];
grad_in += chunk_idx*chunk_size;
T_weight* weight_in = (T_weight*)tl.addresses[1][tensor_loc];
weight_in += chunk_idx*chunk_size;
T_weight* mom_in = (T_weight*)tl.addresses[2][tensor_loc];
mom_in += chunk_idx*chunk_size;
at::Half *model_weights_out = nullptr;
if(N == 4)
{
model_weights_out = (at::Half*)tl.addresses[3][tensor_loc];
model_weights_out += chunk_idx*chunk_size;
}
n -= chunk_idx*chunk_size;
// Non-divergent exit condition for the __syncthreads
float incoming_grads[ILP];
float incoming_weights[ILP];
float incoming_moms[ILP];
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
incoming_grads[ii] = 0;
incoming_weights[ii] = 0;
incoming_moms[ii] = 0;
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
incoming_grads[ii] = static_cast<float>(grad_in[i])*scale;
incoming_weights[ii] = static_cast<float>(weight_in[i]);
incoming_moms[ii] = static_cast<float>(mom_in[i]);
}
}
// note for clarification to future michael:
// From a pure memory dependency perspective, there's likely no point unrolling
// the write loop, since writes just fire off once their LDGs arrive.
// Put another way, the STGs are dependent on the LDGs, but not on each other.
// There is still compute ILP benefit from unrolling the loop though.
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
// apply weight decay before momentum if necessary
if(wd != 0.f && !wd_after_momentum)
incoming_grads[ii] += wd * incoming_weights[ii];
if(momentum != 0.f)
{
if(!first_run)
incoming_moms[ii] = incoming_moms[ii] * momentum + (1.f - dampening) * incoming_grads[ii];
else // initialize momentums to current incoming grads
incoming_moms[ii] = incoming_grads[ii];
if(nesterov)
incoming_grads[ii] += momentum * incoming_moms[ii];
else
incoming_grads[ii] = incoming_moms[ii];
}
// Apply WD after momentum if desired
if(wd != 0.f && wd_after_momentum)
incoming_grads[ii] += wd * incoming_weights[ii];
// adjust the weight and write out
weight_in[i] += (-lr * incoming_grads[ii]);
// if necessary, write out an fp16 copy of the weights
if(N == 4)
model_weights_out[i] = static_cast<at::Half>(weight_in[i]);
// also write out the new momentum
if(momentum != 0.f)
mom_in[i] = incoming_moms[ii];
}
}
}
}
};
void multi_tensor_sgd_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
float wd,
float momentum,
float dampening,
float lr,
bool nesterov,
bool first_run,
bool wd_after_momentum,
float scale)
{
auto num_tensors = tensor_lists.size();
auto grad_type = tensor_lists[0][0].scalar_type();
auto weight_type = tensor_lists[1][0].scalar_type();
if(num_tensors == 4)
for(int i = 0; i < tensor_lists[3].size(); i++)
TORCH_CHECK(tensor_lists[3][i].scalar_type() == at::ScalarType::Half,
"Additional output tensors should always be fp16.");
// We have 4 possibilities to handle here, in terms of
// grad_type, param_type, momentum_type, requires_fp16_copy
// 1. fp16, fp16, fp16, No
// 2. fp32, fp32, fp32, No
// 3. fp16, fp32, fp32, Yes
// 4. fp32, fp32, fp32, Yes // this is the materialize_master_grads=True case
// It's easier to hardcode these possibilities than to use
// switches etc. to handle the cross-product of cases where
// we don't want the majority of them.
// Case 1. fp16, fp16, fp16, No
if(grad_type == at::ScalarType::Half &&
weight_type == at::ScalarType::Half &&
num_tensors == 3)
{
multi_tensor_apply<3>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
SGDFunctor<3, at::Half, at::Half>(),
wd,
momentum,
dampening,
lr,
nesterov,
first_run,
wd_after_momentum,
scale);
}
// Case 2. fp16, fp32, fp32, No
// else if (grad_type == at::ScalarType::Half &&
// weight_type == at::ScalarType::Float &&
// num_tensors == 3) {
// multi_tensor_apply<3>(
// BLOCK_SIZE,
// chunk_size,
// noop_flag,
// tensor_lists,
// SGDFunctor<3, at::Half, float>(),
// wd,
// momentum,
// dampening,
// lr,
// nesterov,
// first_run,
// wd_after_momentum);
// }
// Case 2. fp32, fp32, fp32, No
else if(grad_type == at::ScalarType::Float &&
weight_type == at::ScalarType::Float &&
num_tensors == 3)
{
multi_tensor_apply<3>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
SGDFunctor<3, float, float>(),
wd,
momentum,
dampening,
lr,
nesterov,
first_run,
wd_after_momentum,
scale);
}
// Case 3. fp16, fp32, fp32, Yes
else if(grad_type == at::ScalarType::Half &&
weight_type == at::ScalarType::Float &&
num_tensors == 4)
{
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
SGDFunctor<4, at::Half, float>(),
wd,
momentum,
dampening,
lr,
nesterov,
first_run,
wd_after_momentum,
scale);
}
// Case 4. fp32, fp32, fp32, Yes
else if(grad_type == at::ScalarType::Float &&
weight_type == at::ScalarType::Float &&
num_tensors == 4)
{
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
SGDFunctor<4, float, float>(),
wd,
momentum,
dampening,
lr,
nesterov,
first_run,
wd_after_momentum,
scale);
}
else
{
AT_ERROR("multi_tensor_sgd only supports some combinations of gradient & weight types. Given: ",
"gradient: ", grad_type, ", weight: ", weight_type, ", num_lists: ", num_tensors);
}
AT_CUDA_CHECK(cudaGetLastError());
}
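A hypothetical Python call into the multi_tensor_sgd binding for case 3 above (fp16 grads, fp32 master weights and momentum, plus an fp16 model-weight copy). Shapes, chunk size, and hyperparameter values are illustrative.
import torch
import amp_C
chunk_size = 2048 * 32
noop_flag = torch.zeros(1, dtype=torch.int, device='cuda')
master_w = [torch.randn(1024, device='cuda') for _ in range(2)]
grads    = [torch.randn(1024, device='cuda', dtype=torch.half) for _ in range(2)]
momenta  = [torch.zeros_like(w) for w in master_w]
model_w  = [w.half() for w in master_w]      # fp16 copies, rewritten by the kernel
amp_C.multi_tensor_sgd(
    chunk_size, noop_flag,
    [grads, master_w, momenta, model_w],     # tl[0..3] as documented above
    1e-4,     # wd
    0.9,      # momentum
    0.0,      # dampening
    0.1,      # lr
    False,    # nesterov
    True,     # first_run
    False,    # wd_after_momentum
    1.0)      # scale multiplied into the grads (e.g. 1/loss_scale)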
......@@ -55,10 +55,12 @@ std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input);
// mean/inv_std have promoted data type (dtype==fp16?fp32:dtype)
// expect data to be in n+c format (channel last) and applies CUDNN_BATCHNORM_SPATIAL
at::Tensor batchnorm_forward_c_last_CUDA(const at::Tensor input,
const at::optional<at::Tensor> z,
const at::Tensor mean,
const at::Tensor inv_std,
const at::optional<at::Tensor> weight,
const at::optional<at::Tensor> shift);
const at::optional<at::Tensor> shift,
const bool fuse_relu);
// backward BN operation, returns {mean_dy, mean_dy_xmu, grad_weight, grad_bias}
// grad_output/input should have identical data type;
......@@ -82,6 +84,15 @@ at::Tensor batchnorm_backward_c_last_CUDA(const at::Tensor grad_output,
const at::Tensor mean_dy,
const at::Tensor mean_dy_xmu);
at::Tensor relu_backward_c_last_CUDA(const at::Tensor grad_output,
const at::Tensor input,
const at::optional<at::Tensor> z,
const at::Tensor mean,
const at::Tensor inv_std,
const at::optional<at::Tensor> weight,
const at::optional<at::Tensor> shift);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("welford_mean_var", &welford_mean_var_CUDA, "welford mean variance");
m.def("welford_parallel", &welford_parallel_CUDA, "welford parallel reduce mean variance");
......@@ -92,4 +103,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("batchnorm_forward_c_last", &batchnorm_forward_c_last_CUDA, "batchnorm forward nhwc");
m.def("reduce_bn_c_last", &reduce_bn_c_last_CUDA, "batchnorm backwards reduce grad sum and bias/weight grad nhwc");
m.def("batchnorm_backward_c_last", &batchnorm_backward_c_last_CUDA, "batchnorm backward dgrad nhwc");
m.def("relu_bw_c_last", &relu_backward_c_last_CUDA, "relu_bw_c_last");
}
......@@ -128,3 +128,53 @@ __device__ __forceinline__ T reduce_block_into_lanes
return final;
}
template<typename T>
__device__ __forceinline__ T reduce_block_into_lanes_max_op
(T *x,
T val,
int lanes=1,
bool share_result=false) // lanes is intended to be <= 32.
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid+i]));
__syncthreads();
}
T final;
if(tid < 32)
{
if(blockSize >= 64)
final = fmaxf(fabsf(x[tid]), fabsf(x[tid+32]));
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
}
if(share_result)
{
if(tid < lanes)
x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}
return final;
}