Merge pull request #20 from lcskrishna/ifu_06052020

IFU_06_05_2020

Merge pull request #20 from lcskrishna/ifu_06052020
IFU_06_05_2020
37989915 · Ashish Farmer · GitHub · b0c7d09f · 097238f8 · 37989915
Commit 37989915 (unverified), authored Jun 08, 2020 by Ashish Farmer; committed by GitHub on Jun 08, 2020
15 changed files
--- a/apex/contrib/multihead_attn/encdec_multihead_attn.py
+++ b/apex/contrib/multihead_attn/encdec_multihead_attn.py
@@ -6,7 +6,12 @@ import torch.nn.functional as F
 from .encdec_multihead_attn_func               import encdec_attn_func
 from .fast_encdec_multihead_attn_func          import fast_encdec_attn_func
 from .fast_encdec_multihead_attn_norm_add_func import fast_encdec_attn_norm_add_func
+from apex.normalization.fused_layer_norm       import FusedLayerNorm

+if hasattr(torch._C, '_jit_set_profiling_executor') :
+    torch._C._jit_set_profiling_executor(False)
+if hasattr(torch._C, '_jit_set_profiling_mode') :
+    torch._C._jit_set_profiling_mode(False)

 @torch.jit.script
 def jit_dropout_add(x, residual, prob, is_training):
@@ -57,9 +62,9 @@ class EncdecMultiheadAttn(nn.Module):
                self.register_parameter('lyr_norm_beta_weights', None)
                self.lyr_nrm_gamma_weights = None
                self.lyr_nrm_beta_weights  = None
-                self.lyr_nrm = torch.nn.LayerNorm(embed_dim)
+                self.lyr_nrm = FusedLayerNorm(embed_dim)
        self.reset_parameters()
-        
+
        if self.include_norm_add:
            if   impl == 'fast'    : self.attn_func = fast_encdec_attn_norm_add_func
            elif impl == 'default' : self.attn_func = encdec_attn_func

--- a/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
@@ -203,7 +203,7 @@ class EncdecAttnFunc(torch.autograd.Function):
        values_grads   = torch.bmm(dropout_results.transpose(1,2), output_lin_grads, out=values_grads.transpose(0,1))

        # Mask and Scaling for Dropout (not a publicly documented op)
-        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, dropout_prob_t[0])
+        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, 1.0/(1.0-dropout_prob_t[0]))

        # Softmax Grad (not a publicly documented op)
        softmax_grads = torch._softmax_backward_data(dropout_grads, softmax_results, -1, softmax_results)

--- a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
 import torch
 import fast_self_multihead_attn
-
+import fast_self_multihead_attn_bias
+import fast_self_multihead_attn_bias_additive_mask

 class FastSelfAttnFunc(torch.autograd.Function) :
    @staticmethod
-    def forward(ctx, use_time_mask, is_training, heads, inputs, input_weights, output_weights, pad_mask, dropout_prob):
+    def forward(ctx, use_time_mask, is_training, heads, inputs, input_weights, output_weights, input_biases, output_biases, pad_mask, mask_additive, dropout_prob):
+        use_biases_t   = torch.tensor([input_biases is not None])
        heads_t        = torch.tensor([heads])
        dropout_prob_t = torch.tensor([dropout_prob])
        null_tensor    = torch.tensor([])
        use_mask       = (pad_mask is not None)

-        input_lin_results,                                              \
-        softmax_results,                                                \
-        dropout_results,                                                \
-        dropout_mask,                                                   \
-        matmul2_results,                                                \
-        outputs =                                                       \
-            fast_self_multihead_attn.forward(                           \
-                              use_mask,                                 \
-                              use_time_mask,                            \
-                              is_training,                              \
-                              heads,                                    \
-                              inputs,                                   \
-                              input_weights,                            \
-                              output_weights,                           \
-                              pad_mask if use_mask else null_tensor,    \
-                              dropout_prob)
+        if use_biases_t[0]:
+            if not mask_additive:
+                input_lin_results,                                              \
+                softmax_results,                                                \
+                dropout_results,                                                \
+                dropout_mask,                                                   \
+                matmul2_results,                                                \
+                outputs =                                                       \
+                    fast_self_multihead_attn_bias.forward(                           \
+                                      use_mask,                                 \
+                                      use_time_mask,                            \
+                                      is_training,                              \
+                                      heads,                                    \
+                                      inputs,                                   \
+                                      input_weights,                            \
+                                      output_weights,                           \
+                                      input_biases,                           \
+                                      output_biases,                           \
+                                      pad_mask if use_mask else null_tensor,    \
+                                      dropout_prob)
+            else:
+                input_lin_results,                                              \
+                softmax_results,                                                \
+                dropout_results,                                                \
+                dropout_mask,                                                   \
+                matmul2_results,                                                \
+                outputs =                                                       \
+                    fast_self_multihead_attn_bias_additive_mask.forward(                           \
+                                      use_mask,                                 \
+                                      use_time_mask,                            \
+                                      is_training,                              \
+                                      heads,                                    \
+                                      inputs,                                   \
+                                      input_weights,                            \
+                                      output_weights,                           \
+                                      input_biases,                           \
+                                      output_biases,                           \
+                                      pad_mask if use_mask else null_tensor,    \
+                                      dropout_prob)
+

-        ctx.save_for_backward(heads_t,                                  \
+        else:
+            input_lin_results,                                              \
+            softmax_results,                                                \
+            dropout_results,                                                \
+            dropout_mask,                                                   \
+            matmul2_results,                                                \
+            outputs =                                                       \
+                fast_self_multihead_attn.forward(                           \
+                                  use_mask,                                 \
+                                  use_time_mask,                            \
+                                  is_training,                              \
+                                  heads,                                    \
+                                  inputs,                                   \
+                                  input_weights,                            \
+                                  output_weights,                           \
+                                  pad_mask if use_mask else null_tensor,    \
+                                  dropout_prob)
+
+        ctx.save_for_backward(use_biases_t,                                  \
+                              heads_t,                          \
                              matmul2_results,                          \
                              dropout_results,                          \
                              softmax_results,                          \
@@ -38,10 +83,12 @@ class FastSelfAttnFunc(torch.autograd.Function) :
                              dropout_mask,                             \
                              dropout_prob_t)

+
        return outputs.detach()

    @staticmethod
    def backward(ctx, output_grads):
+        use_biases_t,                                                        \
        heads_t,                                                        \
        matmul2_results,                                                \
        dropout_results,                                                \
@@ -53,22 +100,43 @@ class FastSelfAttnFunc(torch.autograd.Function) :
        dropout_mask,                                                   \
        dropout_prob_t      = ctx.saved_tensors

-        input_grads,                                                    \
-        input_weight_grads,                                             \
-        output_weight_grads =                                           \
-            fast_self_multihead_attn.backward(                          \
-                              heads_t[0],                               \
-                              output_grads,                             \
-                              matmul2_results,                          \
-                              dropout_results,                          \
-                              softmax_results,                          \
-                              input_lin_results,                        \
-                              inputs,                                   \
-                              input_weights,                            \
-                              output_weights,                           \
-                              dropout_mask,                             \
-                              dropout_prob_t[0])
+        if use_biases_t[0]:
+            input_grads,                                                    \
+            input_weight_grads,                                             \
+            output_weight_grads,                                           \
+            input_bias_grads,                                                   \
+            output_bias_grads =                                                    \
+                fast_self_multihead_attn_bias.backward(                          \
+                                  heads_t[0],                               \
+                                  output_grads,                             \
+                                  matmul2_results,                          \
+                                  dropout_results,                          \
+                                  softmax_results,                          \
+                                  input_lin_results,                        \
+                                  inputs,                                   \
+                                  input_weights,                            \
+                                  output_weights,                           \
+                                  dropout_mask,                             \
+                                  dropout_prob_t[0])

-        return None, None, None, input_grads, input_weight_grads, output_weight_grads, None, None
+        else:
+            input_bias_grads = None                                                    
+            output_bias_grads = None
+            input_grads,                                                    \
+            input_weight_grads,                                             \
+            output_weight_grads =                                           \
+                fast_self_multihead_attn.backward(                          \
+                                  heads_t[0],                               \
+                                  output_grads,                             \
+                                  matmul2_results,                          \
+                                  dropout_results,                          \
+                                  softmax_results,                          \
+                                  input_lin_results,                        \
+                                  inputs,                                   \
+                                  input_weights,                            \
+                                  output_weights,                           \
+                                  dropout_mask,                             \
+                                  dropout_prob_t[0])
+        return None, None, None, input_grads, input_weight_grads, output_weight_grads,input_bias_grads, output_bias_grads, None, None, None

 fast_self_attn_func = FastSelfAttnFunc.apply
--- a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
+++ b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
+import torch
+import fast_mask_softmax_dropout
+import fast_additive_mask_softmax_dropout
+
+
+class MaskSoftmaxDropout(torch.autograd.Function) :
+    @staticmethod
+    def forward(ctx, is_training, heads, inputs, pad_mask, mask_additive, dropout_prob):
+        heads_t        = torch.tensor([heads])
+        dropout_prob_t = torch.tensor([dropout_prob])
+        null_tensor    = torch.tensor([])
+        use_mask       = (pad_mask is not None)
+        use_mask_t     = torch.tensor([use_mask])
+        mask_additive_t     = torch.tensor([mask_additive])
+
+        if mask_additive:
+            dropout_results,                                                \
+            dropout_mask,                                                   \
+            softmax_results =                                                \
+                    fast_additive_mask_softmax_dropout.forward(                           \
+                                      use_mask,                                 \
+                                      is_training,                              \
+                                      heads,                                    \
+                                      inputs,                                   \
+                                      pad_mask if use_mask else null_tensor,    \
+                                      dropout_prob)
+        else:
+            dropout_results,                                                \
+            dropout_mask,                                                   \
+            softmax_results =                                                \
+                    fast_mask_softmax_dropout.forward(                           \
+                                      use_mask,                                 \
+                                      is_training,                              \
+                                      heads,                                    \
+                                      inputs,                                   \
+                                      pad_mask if use_mask else null_tensor,    \
+                                      dropout_prob)
+        
+        ctx.save_for_backward(
+                              use_mask_t,                                    \
+                              heads_t,                                 \
+                              softmax_results,                          \
+                              dropout_mask,                             \
+                              pad_mask if use_mask else null_tensor,        \
+                              mask_additive_t,        \
+                              dropout_prob_t)
+
+        return dropout_results.detach()
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        use_mask_t, \
+        heads_t,   \
+        softmax_results,                                                \
+        dropout_mask,                                              \
+        pad_mask,                                                   \
+        mask_additive_t,                                                   \
+        dropout_prob_t      = ctx.saved_tensors
+
+        if mask_additive_t[0]:
+            input_grads =                                                    \
+                fast_additive_mask_softmax_dropout.backward(                          \
+                                  use_mask_t[0],                             \
+                                  heads_t[0],                             \
+                                  output_grads,                             \
+                                  softmax_results,                          \
+                                  dropout_mask,                             \
+                                  dropout_prob_t[0])
+        else:
+            input_grads =                                                    \
+                fast_mask_softmax_dropout.backward(                          \
+                                  use_mask_t[0],                             \
+                                  heads_t[0],                             \
+                                  output_grads,                             \
+                                  softmax_results,                          \
+                                  dropout_mask,                             \
+                                  pad_mask,                             \
+                                  dropout_prob_t[0])
+        return None, None, input_grads, None, None, None
+
+fast_mask_softmax_dropout_func = MaskSoftmaxDropout.apply
--- a/apex/contrib/multihead_attn/self_multihead_attn.py
+++ b/apex/contrib/multihead_attn/self_multihead_attn.py
@@ -6,7 +6,12 @@ import torch.nn.functional as F
 from .self_multihead_attn_func               import self_attn_func
 from .fast_self_multihead_attn_func          import fast_self_attn_func
 from .fast_self_multihead_attn_norm_add_func import fast_self_attn_norm_add_func
+from apex.normalization.fused_layer_norm     import FusedLayerNorm

+if hasattr(torch._C, '_jit_set_profiling_executor') :
+    torch._C._jit_set_profiling_executor(False)
+if hasattr(torch._C, '_jit_set_profiling_mode') :
+    torch._C._jit_set_profiling_mode(False)

 @torch.jit.script
 def jit_dropout_add(x, residual, prob, is_training):
@@ -21,7 +26,7 @@ class SelfMultiheadAttn(nn.Module):

    See "Attention Is All You Need" for more details.
    """
-    def __init__(self, embed_dim, num_heads, dropout=0., bias=False, include_norm_add=False, impl='fast'):
+    def __init__(self, embed_dim, num_heads, dropout=0., bias=False, include_norm_add=False, impl='fast', separate_qkv_params=False, mask_additive=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
@@ -32,17 +37,38 @@ class SelfMultiheadAttn(nn.Module):
        self.include_norm_add = include_norm_add
        self.impl = impl
        self.scaling = self.head_dim**-0.5
-
-        self.in_proj_weight  = Parameter(torch.Tensor(3*embed_dim, embed_dim))
+        self.separate_qkv_params = separate_qkv_params
+        self.mask_additive = mask_additive
+        if mask_additive:
+            assert self.include_norm_add == False, "additive mask not supported with layer norm"
+            assert impl == 'default' or (impl == 'fast' and bias), "additive mask not supported for fast mode without bias"
+        if separate_qkv_params:
+            self.q_weight  = Parameter(torch.Tensor(embed_dim, embed_dim))
+            self.k_weight  = Parameter(torch.Tensor(embed_dim, embed_dim))
+            self.v_weight  = Parameter(torch.Tensor(embed_dim, embed_dim))
+        else:
+            self.in_proj_weight  = Parameter(torch.Tensor(3*embed_dim, embed_dim))
        self.out_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
        if self.bias:
-            assert impl != 'fast', "ERROR! The Fast implementation does not support biases!"
-            self.in_proj_bias = Parameter(torch.Tensor(3*embed_dim))
+            if separate_qkv_params:
+                self.q_bias  = Parameter(torch.Tensor(embed_dim))
+                self.k_bias  = Parameter(torch.Tensor(embed_dim))
+                self.v_bias  = Parameter(torch.Tensor(embed_dim))
+            else:
+                self.in_proj_bias = Parameter(torch.Tensor(3*embed_dim))
            self.out_proj_bias = Parameter(torch.Tensor(embed_dim))
        else:
-            self.register_parameter('in_proj_bias', None)
+            if separate_qkv_params:
+                self.register_parameter('q_bias', None)
+                self.register_parameter('k_bias', None)
+                self.register_parameter('v_bias', None)
+                self.q_bias = None
+                self.k_bias = None
+                self.v_bias = None
+            else:
+                self.register_parameter('in_proj_bias', None)
+                self.in_proj_bias = None
            self.register_parameter('out_proj_bias', None)
-            self.in_proj_bias = None
            self.out_proj_bias = None
        if self.include_norm_add:
            if impl == 'fast':
@@ -54,7 +80,7 @@ class SelfMultiheadAttn(nn.Module):
                self.register_parameter('lyr_norm_beta_weights', None)
                self.lyr_nrm_gamma_weights = None
                self.lyr_nrm_beta_weights  = None
-                self.lyr_nrm = torch.nn.LayerNorm(embed_dim)
+                self.lyr_nrm = FusedLayerNorm(embed_dim)
        self.reset_parameters()

        if self.include_norm_add:
@@ -67,10 +93,20 @@ class SelfMultiheadAttn(nn.Module):
            else :                   assert False, "Unsupported impl: {} !".format(impl)

    def reset_parameters(self):
-        nn.init.xavier_uniform_(self.in_proj_weight)
+        if self.separate_qkv_params:
+            nn.init.xavier_uniform_(self.q_weight)
+            nn.init.xavier_uniform_(self.k_weight)
+            nn.init.xavier_uniform_(self.v_weight)
+        else:
+            nn.init.xavier_uniform_(self.in_proj_weight)
        nn.init.xavier_uniform_(self.out_proj_weight)
        if self.bias:
-            nn.init.constant_(self.in_proj_bias, 0.)
+            if self.separate_qkv_params:
+                nn.init.constant_(self.q_bias, 0.)
+                nn.init.constant_(self.k_bias, 0.)
+                nn.init.constant_(self.v_bias, 0.)
+            else:
+                nn.init.constant_(self.in_proj_bias, 0.)
            nn.init.constant_(self.out_proj_bias, 0.)
        if self.include_norm_add:
            if self.impl == 'fast':
@@ -88,10 +124,22 @@ class SelfMultiheadAttn(nn.Module):
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """
+        if self.separate_qkv_params:
+            input_weights = torch.cat([self.q_weight.view(self.num_heads,1,self.head_dim,self.embed_dim), self.k_weight.view(self.num_heads,1,self.head_dim,self.embed_dim), self.v_weight.view(self.num_heads,1,self.head_dim,self.embed_dim)], dim=1).reshape(3*self.embed_dim,self.embed_dim).contiguous()
+        else: 
+            input_weights = self.in_proj_weight
+        if self.bias:
+            if self.separate_qkv_params:
+                input_bias = torch.cat([self.q_bias.view(self.num_heads,1,self.head_dim), self.k_bias.view(self.num_heads,1,self.head_dim), self.v_bias.view(self.num_heads,1,self.head_dim)],dim=1).reshape(3*self.embed_dim).contiguous()
+            else:
+                input_bias = self.in_proj_bias
+        else:
+            input_bias=None        
        if key_padding_mask is not None:
            assert (attn_mask is None), "ERROR attn_mask and key_padding_mask should not be both defined!"
            mask = key_padding_mask
        elif attn_mask is not None:
+            assert self.mask_additive == False, "additive mask not supported for time mask"
            mask = attn_mask
        else:
            mask = None
@@ -100,12 +148,12 @@ class SelfMultiheadAttn(nn.Module):
            if self.impl == 'fast':
                outputs = self.attn_func(attn_mask is not None, is_training, self.num_heads, query,
                                         self.lyr_nrm_gamma_weights, self.lyr_nrm_beta_weights,
-                                         self.in_proj_weight, self.out_proj_weight, mask, self.dropout)
+                                         input_weights, self.out_proj_weight, mask, self.dropout)
            else:
                lyr_nrm_results = self.lyr_nrm(query)
                outputs = self.attn_func(attn_mask is not None, is_training, self.num_heads, self.scaling, lyr_nrm_results,
-                                         self.in_proj_weight, self.out_proj_weight,
-                                         self.in_proj_bias, self.out_proj_bias,
+                                         input_weights, self.out_proj_weight,
+                                         input_bias, self.out_proj_bias,
                                         mask, self.dropout)
                if is_training:
                    outputs = jit_dropout_add(outputs, query, self.dropout, is_training)
@@ -114,11 +162,11 @@ class SelfMultiheadAttn(nn.Module):
        else:
            if self.impl == 'fast':
                outputs = self.attn_func(attn_mask is not None, is_training, self.num_heads, query,
-                                         self.in_proj_weight, self.out_proj_weight, mask, self.dropout)
+                                         input_weights, self.out_proj_weight, input_bias, self.out_proj_bias, mask, self.mask_additive, self.dropout)
            else:
                outputs = self.attn_func(attn_mask is not None, is_training, self.num_heads, self.scaling, query,
-                                         self.in_proj_weight, self.out_proj_weight,
-                                         self.in_proj_bias, self.out_proj_bias,
-                                         mask, self.dropout)
+                                         input_weights, self.out_proj_weight,
+                                         input_bias, self.out_proj_bias,
+                                         mask, self.mask_additive, self.dropout)

        return outputs,None
--- a/apex/contrib/optimizers/distributed_fused_adam.py
+++ b/apex/contrib/optimizers/distributed_fused_adam.py
@@ -264,9 +264,11 @@ class DistributedFusedAdam(torch.optim.Optimizer):
                grp = torch.distributed.new_group(ranks=ranks)
                if torch.distributed.get_rank() in ranks:
                    self._rs_pg.append(grp)
-            if self._compute_L2_grad_norm and torch.distributed.get_rank() in ranks:
-                self._l2_grad_norm_pg = torch.distributed.new_group(ranks=ranks)
-                torch.distributed.all_reduce(self._overflow_buf,group=self._l2_grad_norm_pg)
+            if self._compute_L2_grad_norm:
+                l2_grad_norm_pg = torch.distributed.new_group(ranks=ranks)
+                if torch.distributed.get_rank() in ranks:
+                    self._l2_grad_norm_pg = l2_grad_norm_pg
+                    torch.distributed.all_reduce(self._overflow_buf,group=self._l2_grad_norm_pg)
        self._rs_st = [torch.cuda.Stream() for _ in range(self._num_rs_pg)]
        for rs_pg in self._rs_pg:
            torch.distributed.all_reduce(self._overflow_buf,group=rs_pg)

--- a/apex/contrib/optimizers/distributed_fused_lamb.py
+++ b/apex/contrib/optimizers/distributed_fused_lamb.py
+import math
+import torch
+import importlib
+import amp_C
+from apex.multi_tensor_apply import multi_tensor_applier
+
+class DistributedFusedLAMB(torch.optim.Optimizer):
+
+    """Implements LAMB algorithm.
+
+    Currently GPU-only.  Requires Apex to be installed via
+    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.
+
+    This version of fused LAMB implements 2 fusions.
+
+      * Fusion of the LAMB update's elementwise operations
+      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
+
+    :class:`apex.optimizers.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer::
+
+        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
+        ...
+        opt.step()
+
+    :class:`apex.optimizers.FusedLAMB` may be used with or without Amp.  If you wish to use :class:`FusedLAMB` with Amp,
+    you may choose any ``opt_level``::
+
+        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
+        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
+        ...
+        opt.step()
+
+    In general, ``opt_level="O1"`` is recommended.
+
+    LAMB was proposed in `Large Batch Optimization for Deep Learning - Training BERT in 76 minutes`_.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its norm. (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability. (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_
+            NOT SUPPORTED now! (default: False)
+        adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
+            True for decoupled weight decay(also known as AdamW) (default: True)
+        grad_averaging (bool, optional): whether apply (1-beta2) to grad when
+            calculating running averages of gradient. (default: True)
+        set_grad_none (bool, optional): whether set grad to None when zero_grad()
+            method is called. (default: True)
+        max_grad_norm (float, optional): value used to clip global grad norm
+            (default: 1.0)
+        use_nvlamb (boolean, optional): Apply adaptive learning rate to 0.0
+            weight decay parameter (default: False)
+
+    .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:
+        https://arxiv.org/abs/1904.00962
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
+    def __init__(self, params,
+                 lr=1e-3, bias_correction = True, grad_averaging=True,
+                 betas=(0.9, 0.999), eps=1e-8,
+                 weight_decay=0., max_grad_norm=0.,
+                 adam_w_mode=True, use_nvlamb=False,
+                 amp_scale_adjustment=1.0, overlap_reductions=True,
+                 dwu_group_size=0, dwu_num_blocks=4, dwu_num_chunks=4,
+                 dwu_num_rs_pg=1, dwu_num_ar_pg=4, dwu_num_ag_pg=0,
+                 e5m2_allgather=False):
+        """Register the parameter groups and stash the configuration.
+
+        Only the base ``torch.optim.Optimizer`` set-up happens here. Every
+        keyword argument is recorded in ``self._init_args`` and later
+        forwarded to ``__first_step_init__`` by ``_init_everything``, which
+        performs the remaining initialisation once.
+
+        Raises:
+            AssertionError: if ``torch.distributed.reduce_scatter`` does not
+                accept a ``no_copy`` argument.
+        """
+        import inspect
+
+        # Per-group hyper-parameters handed to the base Optimizer.
+        optimizer_defaults = {
+            'lr': lr,
+            'bias_correction': bias_correction,
+            'betas': betas,
+            'eps': eps,
+            'weight_decay': weight_decay,
+            'grad_averaging': grad_averaging,
+            'max_grad_norm': max_grad_norm,
+        }
+        super(DistributedFusedLAMB, self).__init__(params, optimizer_defaults)
+
+        # Full keyword set replayed into __first_step_init__ later on.
+        self._init_args = dict(
+            lr=lr,
+            bias_correction=bias_correction,
+            grad_averaging=grad_averaging,
+            betas=betas,
+            eps=eps,
+            weight_decay=weight_decay,
+            max_grad_norm=max_grad_norm,
+            adam_w_mode=adam_w_mode,
+            use_nvlamb=use_nvlamb,
+            amp_scale_adjustment=amp_scale_adjustment,
+            overlap_reductions=overlap_reductions,
+            dwu_group_size=dwu_group_size,
+            dwu_num_blocks=dwu_num_blocks,
+            dwu_num_chunks=dwu_num_chunks,
+            dwu_num_rs_pg=dwu_num_rs_pg,
+            dwu_num_ar_pg=dwu_num_ar_pg,
+            dwu_num_ag_pg=dwu_num_ag_pg,
+            e5m2_allgather=e5m2_allgather,
+        )
+        self._init_done = False
+
+        # The reduction pipeline passes no_copy=True to reduce_scatter, so
+        # refuse to continue on a c10d build whose signature lacks it.
+        reduce_scatter_args = inspect.getfullargspec(torch.distributed.reduce_scatter).args
+        assert ('no_copy' in reduce_scatter_args), "This version of c10d does not support no_copy option"
+
+
+    def __first_step_init__(self,
+                 lr=1e-3, bias_correction = True, grad_averaging=True,
+                 betas=(0.9, 0.999), eps=1e-8,
+                 weight_decay=0., max_grad_norm=0.,
+                 adam_w_mode=True, use_nvlamb=False,
+                 amp_scale_adjustment=1.0, overlap_reductions=True,
+                 dwu_group_size=0, dwu_num_blocks=4, dwu_num_chunks=4,
+                 dwu_num_rs_pg=1, dwu_num_ar_pg=4, dwu_num_ag_pg=0,
+                 e5m2_allgather=False):
+        """Perform the deferred part of construction.
+
+        Called by ``_init_everything`` with the keyword arguments captured
+        in ``__init__``. In order, this method:
+
+          1. imports the ``fused_adam_cuda`` / ``distributed_lamb_cuda``
+             extension modules and binds the LAMB kernels,
+          2. broadcasts every parameter from rank 0 and registers a hook on
+             each trainable parameter's gradient accumulator,
+          3. allocates the flat gradient, new-parameter and packed
+             optimizer-state buffers and builds block/chunk/shard views,
+          4. copies this rank's parameter fragments into the fp32 master
+             buffer and records the tensor lists fed to the kernels,
+          5. creates the process groups and CUDA streams used by the
+             reduce-scatter / all-reduce / all-gather pipeline.
+
+        ``weight_decay``, ``bias_correction``, ``betas`` and ``eps`` are
+        read per parameter group from ``self.param_groups``; the ``lr`` and
+        ``max_grad_norm`` arguments are not used in this method.
+        ``grad_averaging`` is taken from the argument, not from the group.
+        """
+        global fused_adam_cuda, distributed_lamb_cuda
+        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
+        distributed_lamb_cuda = importlib.import_module("distributed_lamb_cuda")
+
+        self._amp_scale_adjustment = amp_scale_adjustment
+
+        self._overflow_buf = torch.cuda.IntTensor([0])
+        self._has_overflow = False
+        self.multi_tensor_lamb_compute_update_term = distributed_lamb_cuda.multi_tensor_lamb_compute_update_term
+        self.multi_tensor_lamb_update_weights = distributed_lamb_cuda.multi_tensor_lamb_update_weights
+        import amp_C
+        self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
+
+        self._adam_w_mode = 1 if adam_w_mode else 0
+        self._use_nvlamb = use_nvlamb
+        self._is_accumulation_step = False
+        self._last_step = False
+        self._overlap_reductions = overlap_reductions
+        self._num_blocks = dwu_num_blocks
+        self._num_chunks = dwu_num_chunks
+        self._e5m2_allgather = e5m2_allgather
+        self._L2_grad_norm = None
+        # A non-positive dwu_group_size falls back to the local device count.
+        self._group_size = torch.cuda.device_count() if dwu_group_size <= 0 else dwu_group_size
+        self._world_size = torch.distributed.get_world_size()
+        self._num_groups = self._world_size // self._group_size
+        self._rank_in_group = torch.distributed.get_rank() % self._group_size
+
+        # Walk all parameters, assigning each trainable one an offset into the
+        # flat gradient buffer and hooking its gradient accumulator.
+        p_offset = 0
+        p_i = 0
+        self._model_params = []
+        self._grads_info = []
+        self._grad_accs = []
+        self._group_properties = []
+        for group in self.param_groups:
+            prev = None
+            beta1, beta2 = group['betas']
+            for p in group['params']:
+                # Broadcast happens for every parameter, trainable or not.
+                torch.distributed.broadcast(p,0)
+                if not p.requires_grad:
+                    continue
+                self._model_params.append(p)
+                # (weight_decay, bias_correction, beta1, beta2, beta3, eps)
+                self._group_properties.append((
+                    group['weight_decay'],
+                    1 if group['bias_correction'] else 0,
+                    beta1,
+                    beta2,
+                    1.0 - beta1 if grad_averaging else 1.0,
+                    group['eps']
+                    ))
+                p_grads_size = p.numel()
+                # wrapper() binds the current loop values into the hook closure.
+                def wrapper(param, param_i, param_grads_size, param_offset):
+                    param_tmp = param.expand_as(param)
+                    grad_acc = param_tmp.grad_fn.next_functions[0][0]
+                    def allreduce_hook(*unused):
+                        self._do_overlapped_reduction(param_i, param_grads_size, param_offset, param)
+                    grad_acc.register_hook(allreduce_hook)
+                    # Keep a reference to the accumulator node alongside its hook.
+                    self._grad_accs.append(grad_acc)
+                self._grads_info.append({"param_grads_size":p_grads_size, "param_offset":p_offset})
+                wrapper(p, p_i, p_grads_size, p_offset)
+                p_offset += p_grads_size
+                # Only enforce 128b alignment (64 * fp16) for non-consecutive parameters
+                # RNN is one example of consecutive parameters:
+                # (weight_ih, weight_hh, bias_ih, bias_hh)
+                if prev is not None and (prev.data_ptr() + prev.numel() * prev.element_size() != p.data_ptr()):
+                    p_offset = ((p_offset + 63) // 64) * 64
+                prev = p
+                p_i += 1
+        self._grads_generated = [False]*len(self._grads_info)
+        self._grads_fp16, self._grads_fp32 = [], []
+        if self._overlap_reductions:
+            self._current_block = self._num_blocks
+
+        # Round the flat size up so it divides evenly into
+        # blocks -> chunks -> shards (one shard per rank in the group).
+        self._net_total_param_size = p_offset
+        self._total_param_size = p_offset
+        dwu_min_page_size = 256 * self._num_blocks * self._num_chunks * self._group_size
+        self._total_param_size = ((self._total_param_size + dwu_min_page_size - 1) // dwu_min_page_size) * dwu_min_page_size
+        self._block_size = self._total_param_size // self._num_blocks
+        self._chunk_size = self._block_size // self._num_chunks
+        self._shard_size = self._chunk_size // self._group_size
+        print("self._net_total_param_size=%d, self._total_param_size=%d, dwu_min_page_size=%d, self._block_size=%d, self._chunk_size=%d, self._shard_size=%d" % (self._net_total_param_size, self._total_param_size,dwu_min_page_size,self._block_size,self._chunk_size,self._shard_size))
+
+        # For each block, the index of the last parameter whose offset is not
+        # past the start of the block (clamped at parameter 0).
+        self._low_param_i = [0]*self._num_blocks
+        for block_id in range(self._num_blocks-1,-1,-1):
+            p_i = len(self._grads_info)-1
+            while p_i > 0 and self._grads_info[p_i]["param_offset"] > block_id*self._block_size:
+                p_i -= 1
+            self._low_param_i[block_id] = p_i
+        print(self._low_param_i)
+
+        self._flat_grads = torch.zeros([self._total_param_size], dtype=torch.float16, device='cuda')
+        self._new_params = torch.zeros([self._total_param_size], dtype=torch.uint8 if self._e5m2_allgather else torch.float16, device='cuda')
+        # Size of the packed buffers holding this rank's shard of every chunk.
+        self._mega_shard_size = self._num_blocks * self._num_chunks * self._shard_size
+        self._fp32_p = torch.zeros([self._mega_shard_size], dtype=torch.float32, device='cuda')
+        self._fp32_m = torch.zeros([self._mega_shard_size], dtype=torch.float32, device='cuda')
+        self._fp32_v = torch.zeros([self._mega_shard_size], dtype=torch.float32, device='cuda')
+        self._fp32_u = torch.zeros([self._mega_shard_size], dtype=torch.float32, device='cuda')
+        # FIXME: Rethink fp16 label since it's either uint8 or fp16
+        self._fp16_p = torch.zeros([self._mega_shard_size], dtype=torch.uint8 if self._e5m2_allgather else torch.float16, device='cuda')
+        self._fp16_g = torch.zeros([self._mega_shard_size], dtype=torch.float16, device='cuda')
+
+        # Per-parameter views into the flat gradient buffer, shaped like the parameter.
+        self._individual_flat_grads = []
+        for p_i, (grads_info, p) in enumerate(zip(self._grads_info, self._model_params)):
+            self._individual_flat_grads.append(self._flat_grads[grads_info["param_offset"]:grads_info["param_offset"]+grads_info["param_grads_size"]].view_as(p))
+
+        # Split a flat [block][chunk][shard] buffer into nested lists of views.
+        def _flat_split(p):
+            def __blockify(p):
+                return [p[block_id*self._block_size:(block_id+1)*self._block_size] for block_id in range(self._num_blocks)]
+            def __chunkify(p):
+                return [p[chunk_id*self._chunk_size:(chunk_id+1)*self._chunk_size] for chunk_id in range(self._num_chunks)]
+            def __shardify(p):
+                return [p[shard_id*self._shard_size:(shard_id+1)*self._shard_size] for shard_id in range(self._group_size)]
+            # Split the buffer that was passed in (previously this ignored the
+            # argument and always split self._flat_grads).
+            list_of_blocks = __blockify(p)
+            list_of_list_of_chunks = [__chunkify(block) for block in list_of_blocks]
+            list_of_list_of_list_of_shards = [[__shardify(chunk) for chunk in chunks] for chunks in list_of_list_of_chunks]
+            return list_of_blocks, list_of_list_of_chunks, list_of_list_of_list_of_shards
+        self._flat_grads_blocks, self._flat_grads_chunks, self._flat_grads_shards = _flat_split(self._flat_grads)
+        # Split a full-size buffer laid out as [shard][block][chunk].
+        def _full_packed_split(p):
+            def __shardify(p):
+                return [p[mega_shard*self._mega_shard_size:(mega_shard+1)*self._mega_shard_size] for mega_shard in range(self._group_size)]
+            def __blockify(p):
+                return [p[block_id*self._num_chunks*self._shard_size:(block_id+1)*self._num_chunks*self._shard_size] for block_id in range(self._num_blocks)]
+            def __chunkify(p):
+                return [p[chunk_id*self._shard_size:(chunk_id+1)*self._shard_size] for chunk_id in range(self._num_chunks)]
+            list_of_mega_shards = __shardify(p)
+            list_of_list_of_mega_blocks = [__blockify(mega_shard) for mega_shard in list_of_mega_shards]
+            list_of_list_of_list_of_mega_chunks = [[__chunkify(mega_block) for mega_block in mega_blocks] for mega_blocks in list_of_list_of_mega_blocks]
+            return list_of_mega_shards, list_of_list_of_mega_blocks, list_of_list_of_list_of_mega_chunks
+        self._new_params_mega_shards, self._new_params_mega_blocks, self._new_params_mega_chunks = _full_packed_split(self._new_params)
+        # Split a single-rank packed buffer laid out as [block][chunk].
+        def _packed_split(p):
+            def __packed_blockify(p):
+                packed_block_size = self._num_chunks*self._shard_size
+                return [p[block_id*packed_block_size:(block_id+1)*packed_block_size] for block_id in range(self._num_blocks)]
+            def __packed_chunkify(p):
+                # in the packed format, each chunk contains one shard, so packed_chunk_size == self._shard_size
+                return [p[chunk_id*self._shard_size:(chunk_id+1)*self._shard_size] for chunk_id in range(self._num_chunks)]
+            list_of_blocks = __packed_blockify(p)
+            list_of_list_of_chunks = [__packed_chunkify(block) for block in list_of_blocks]
+            return list_of_blocks, list_of_list_of_chunks
+        self._fp32_p_blocks, self._fp32_p_chunks = _packed_split(self._fp32_p)
+        self._fp32_m_blocks, self._fp32_m_chunks = _packed_split(self._fp32_m)
+        self._fp32_v_blocks, self._fp32_v_chunks = _packed_split(self._fp32_v)
+        self._fp32_u_blocks, self._fp32_u_chunks = _packed_split(self._fp32_u)
+        self._fp16_p_blocks, self._fp16_p_chunks = _packed_split(self._fp16_p)
+        self._fp16_g_blocks, self._fp16_g_chunks = _packed_split(self._fp16_g)
+
+        # This paragraph does two things:
+        # 1) Copy model parameters into master buffer
+        # 2) Create tensor lists for unpacking new parameter tensor after all-gather
+        self._packed_flat_to_model_params_fp16 = []
+        self._packed_flat_to_model_params_fp32 = []
+        self._model_params_num = len(self._model_params)
+        self._contrib_tensor_list = []
+        self._contrib_min_param_i, self._contrib_max_param_i = -1, -1
+        self._contrib_update_frag_for_norm = []
+        self._contrib_model_param_for_norm_fp16 = []
+        self._contrib_model_param_for_norm_fp32 = []
+        self._contrib_model_param_for_norm_is_fp16 = []
+        self._model_param_is_contrib = [False]*self._model_params_num
+        self._contrib_group_properties = []
+        for shard_id in range(self._group_size):
+            for block_id in range(self._num_blocks):
+                for chunk_id in range(self._num_chunks):
+                    # Position of this (block, chunk, shard) in the flat gradient layout.
+                    flat_shard_start = (((block_id * self._num_chunks + chunk_id) * self._group_size) + shard_id) * self._shard_size
+                    flat_shard_end = flat_shard_start + self._shard_size
+                    for param_i, (p, grads_info, group_props) in enumerate(zip(self._model_params, self._grads_info, self._group_properties)):
+                        flat_grad_start = grads_info["param_offset"]
+                        flat_grad_end = flat_grad_start + grads_info["param_grads_size"]
+                        # Intersection of the parameter's range with the shard's range.
+                        clipped_start = max(flat_grad_start, flat_shard_start)
+                        clipped_end = min(flat_grad_end, flat_shard_end)
+                        if clipped_start < clipped_end:
+                            grad_offset = clipped_start - flat_grad_start
+                            grad_length = clipped_end - clipped_start
+                            shard_offset = clipped_start - flat_shard_start
+                            model_param_fragment = p.view(-1)[grad_offset:grad_offset+grad_length]
+                            new_param_packed_fragment = self._new_params_mega_chunks[shard_id][block_id][chunk_id][shard_offset:shard_offset+grad_length]
+                            if model_param_fragment.dtype == torch.float16:
+                                self._packed_flat_to_model_params_fp16.append( (new_param_packed_fragment, model_param_fragment) )
+                            else:
+                                self._packed_flat_to_model_params_fp32.append( (new_param_packed_fragment, model_param_fragment) )
+                            # Only the shard matching this rank's position in its group
+                            # gets master/optimizer state on this rank.
+                            if shard_id == self._rank_in_group:
+                                self._model_param_is_contrib[param_i] = True
+                                # copy model parameters into master buffer
+                                master_param_fragment = self._fp32_p_chunks[block_id][chunk_id][shard_offset:shard_offset+grad_length]
+                                opti_state_m_fragment = self._fp32_m_chunks[block_id][chunk_id][shard_offset:shard_offset+grad_length]
+                                opti_state_v_fragment = self._fp32_v_chunks[block_id][chunk_id][shard_offset:shard_offset+grad_length]
+                                opti_state_u_fragment = self._fp32_u_chunks[block_id][chunk_id][shard_offset:shard_offset+grad_length]
+                                opti_state_g_fragment = self._fp16_g_chunks[block_id][chunk_id][shard_offset:shard_offset+grad_length]
+                                opti_state_p_fragment = self._fp16_p_chunks[block_id][chunk_id][shard_offset:shard_offset+grad_length]
+                                #print("model_param_fragment.size()=%s, new_param_packed_fragment.size()=%s, master_param_fragment.size()=%s" % (str(model_param_fragment.size()), str(new_param_packed_fragment.size()), str(master_param_fragment.size())))
+                                master_param_fragment.copy_(model_param_fragment)
+                                self._contrib_group_properties.append(group_props)
+                                self._contrib_tensor_list.append((master_param_fragment, opti_state_m_fragment, opti_state_v_fragment, opti_state_u_fragment, opti_state_g_fragment, opti_state_p_fragment)) # p, m, v, u, g, p_copy
+                                self._contrib_update_frag_for_norm.append(opti_state_u_fragment)
+                                if p.dtype == torch.float16:
+                                    self._contrib_model_param_for_norm_fp16.append(p)
+                                else:
+                                    self._contrib_model_param_for_norm_fp32.append(p)
+                                self._contrib_model_param_for_norm_is_fp16.append(p.dtype == torch.float16)
+                                if self._contrib_min_param_i < 0: self._contrib_min_param_i = param_i
+                                self._contrib_max_param_i = param_i
+        self._contrib_model_param_for_norm_num = len(self._contrib_model_param_for_norm_is_fp16)
+        # Empty lists become None so the norm helper can branch on presence.
+        if len(self._contrib_model_param_for_norm_fp16) == 0: self._contrib_model_param_for_norm_fp16 = None
+        if len(self._contrib_model_param_for_norm_fp32) == 0: self._contrib_model_param_for_norm_fp32 = None
+        self._contrib_model_param_for_norm_is_fp32 = torch.tensor([not is_fp16 for is_fp16 in self._contrib_model_param_for_norm_is_fp16], dtype=torch.bool, device='cuda')
+        self._contrib_model_param_for_norm_is_fp16 = torch.tensor([is_fp16 for is_fp16 in self._contrib_model_param_for_norm_is_fp16], dtype=torch.bool, device='cuda')
+        self._model_param_is_contrib = torch.tensor(self._model_param_is_contrib, dtype=torch.bool, device='cuda')
+
+        # Transpose the per-fragment tuples into per-role tensor lists.
+        p, m, v, u, g, p_copy = list(zip(*self._contrib_tensor_list))
+        self._contrib_compute_update_term_tensor_list = [g, p, m, v, u]
+        self._contrib_update_weights_tensor_list = [u, p, p_copy]
+
+        # One hyper-parameter entry per contributing fragment, as device tensors.
+        math_type = self._fp32_u.dtype
+        decay, bias_correction, beta1, beta2, beta3, epsilon = list(zip(*self._contrib_group_properties))
+        self._contrib_beta1 = torch.tensor(beta1, dtype=math_type, device='cuda')
+        self._contrib_beta2 = torch.tensor(beta2, dtype=math_type, device='cuda')
+        self._contrib_beta3 = torch.tensor(beta3, dtype=math_type, device='cuda')
+        self._contrib_bias_correction = torch.tensor(bias_correction, dtype=torch.int, device='cuda')
+        self._contrib_epsilon = torch.tensor(epsilon, dtype=math_type, device='cuda')
+        self._contrib_weight_decay = torch.tensor(decay, dtype=math_type, device='cuda')
+
+        self._packed_flat_to_model_params_fp16 = list(zip(*self._packed_flat_to_model_params_fp16)) if len(self._packed_flat_to_model_params_fp16) > 0 else None
+        self._packed_flat_to_model_params_fp32 = list(zip(*self._packed_flat_to_model_params_fp32)) if len(self._packed_flat_to_model_params_fp32) > 0 else None
+
+        # Process groups. new_group() is called for every rank list on every
+        # rank; each rank keeps only the groups it belongs to.
+        self._num_rs_pg = dwu_num_rs_pg
+        self._num_ar_pg = dwu_num_ar_pg
+        self._num_ag_pg = dwu_num_ag_pg
+        if self._num_groups > 1:
+            # All-reduce groups: same in-group position across all groups.
+            self._ar_pg = []
+            for dev_i in range(self._group_size):
+                ranks = [dev_i+j*self._group_size for j in range(self._num_groups)]
+                for i in range(self._num_ar_pg):
+                    grp = torch.distributed.new_group(ranks=ranks)
+                    if torch.distributed.get_rank() in ranks:
+                        self._ar_pg.append(grp)
+            self._ar_st = [torch.cuda.Stream() for _ in range(self._num_ar_pg)]
+            # Issue one all_reduce on each freshly created group.
+            for ar_pg in self._ar_pg:
+                torch.distributed.all_reduce(self._overflow_buf,group=ar_pg)
+        # Reduce-scatter groups: the consecutive ranks forming each group.
+        rs_ranks = []
+        for group_i in range(self._num_groups):
+            rs_ranks.append([group_i*self._group_size+j for j in range(self._group_size)])
+        self._rs_pg = []
+        for group_i in range(self._num_groups):
+            ranks = rs_ranks[group_i]
+            for i in range(self._num_rs_pg):
+                grp = torch.distributed.new_group(ranks=ranks)
+                if torch.distributed.get_rank() in ranks:
+                    self._rs_pg.append(grp)
+            # Separate group over the same ranks for the L2 grad-norm all_reduce.
+            l2_grad_norm_pg = torch.distributed.new_group(ranks=ranks)
+            if torch.distributed.get_rank() in ranks:
+                self._l2_grad_norm_pg = l2_grad_norm_pg
+                torch.distributed.all_reduce(self._overflow_buf,group=self._l2_grad_norm_pg)
+        self._rs_st = [torch.cuda.Stream() for _ in range(self._num_rs_pg)]
+        for rs_pg in self._rs_pg:
+            torch.distributed.all_reduce(self._overflow_buf,group=rs_pg)
+        if self._num_ag_pg == 0:
+            # No dedicated all-gather groups requested: reuse the reduce-scatter ones.
+            self._ag_pg = self._rs_pg
+            self._ag_st = self._rs_st
+            self._num_ag_pg = self._num_rs_pg
+        else:
+            self._ag_pg = []
+            for group_i in range(self._num_groups):
+                ranks = rs_ranks[group_i]
+                for i in range(self._num_ag_pg):
+                    grp = torch.distributed.new_group(ranks=ranks)
+                    if torch.distributed.get_rank() in ranks:
+                        self._ag_pg.append(grp)
+            self._ag_st = [torch.cuda.Stream() for _ in range(self._num_ag_pg)]
+            for ag_pg in self._ag_pg:
+                torch.distributed.all_reduce(self._overflow_buf,group=ag_pg)
+        self._l2_grad_norm_st = torch.cuda.Stream()
+        self._completion_st = torch.cuda.Stream()
+
+        # Per-block handles of in-flight async collectives.
+        self._reductions_works = [None]*self._num_blocks
+        self._allgather_works = [None]*self._num_blocks
+
+
+    def _init_everything(self):
+        """Run ``__first_step_init__`` once, using the arguments saved by ``__init__``."""
+        if self._init_done:
+            return
+        self.__first_step_init__(**self._init_args)
+        self._init_done = True
+
+    def set_is_accumulation_step(self, is_accumulation_step):
+        """Set the accumulation flag; while it is true the gradient hook queues nothing for reduction."""
+        self._is_accumulation_step = is_accumulation_step
+
+    def set_last_step(self, last_step):
+        """Set the last-step flag; while it is true the gradient hook does not launch overlapped block reductions."""
+        self._last_step = last_step
+        
+    def _get_flush_block(self):
+        """Return ``[start, end]`` of the next block ready to reduce, or ``[]``.
+
+        The candidate is the block just below ``self._current_block``. It is
+        ready when its lowest parameter has a gradient and the trailing run
+        of generated gradients reaches back to the start of the block. On
+        success ``self._current_block`` is decremented to that block.
+        """
+        if self._current_block <= 0:
+            return []
+        candidate = self._current_block - 1
+        if not self._grads_generated[self._low_param_i[candidate]]:
+            return []
+
+        # Find the first index of the trailing run of generated gradients.
+        total = len(self._grads_generated)
+        run_start = total
+        while run_start > 0 and self._grads_generated[run_start - 1]:
+            run_start -= 1
+        if run_start >= total:
+            return []
+
+        block_start = candidate * self._block_size
+        if self._grads_info[run_start]["param_offset"] > block_start:
+            return []
+
+        self._current_block = candidate
+        return [block_start, block_start + self._block_size]
+
+
+    def _pipeline_block_reductions(self, block_id):
+        """Launch the asynchronous gradient reductions for one block.
+
+        Flushes the queued gradients into the flat buffer (scaled by
+        ``1/world_size``), issues one reduce-scatter per chunk, and, when
+        there is more than one group, follows each with an all-reduce. The
+        work handles are stored in ``self._reductions_works[block_id]``.
+
+        When ``block_id`` is 0 the method also waits for the reductions of
+        every block and stores the global L2 gradient norm in
+        ``self._L2_grad_norm``.
+        """
+        self._flatten_grad_mt(1.0/self._world_size)
+
+        # Reduction within each node
+        # Changes gradient format from [block * chunk * shard] to [shard * block * chunk]
+        # The output format is the same as the fp32 master parameters
+        works = [None]*self._num_chunks
+        for chunk_id in range(self._num_chunks):
+            glob_chunk_id = block_id * self._num_chunks + chunk_id
+            rs_stream = self._rs_st[glob_chunk_id%self._num_rs_pg]
+            rs_stream.wait_stream(torch.cuda.current_stream())
+            with torch.cuda.stream(rs_stream):
+                works[chunk_id] = torch.distributed.reduce_scatter(self._fp16_g_chunks[block_id][chunk_id],self._flat_grads_shards[block_id][chunk_id],group=self._rs_pg[glob_chunk_id%self._num_rs_pg],async_op=True,no_copy=True)
+
+        # Reduction across nodes for each rank
+        if self._num_groups > 1:
+            for chunk_id in range(self._num_chunks):
+                glob_chunk_id = block_id * self._num_chunks + chunk_id
+                ar_stream = self._ar_st[glob_chunk_id%self._num_ar_pg]
+                with torch.cuda.stream(ar_stream):
+                    works[chunk_id].wait()
+                    works[chunk_id] = torch.distributed.all_reduce(self._fp16_g_chunks[block_id][chunk_id],group=self._ar_pg[glob_chunk_id%self._num_ar_pg],async_op=True)
+        self._reductions_works[block_id] = works
+
+        # Compute L2 grad norm
+        if block_id == 0:
+            with torch.cuda.stream(self._l2_grad_norm_st):
+                # Separate loop names so the block_id parameter is not rebound.
+                for done_block_id in range(self._num_blocks):
+                    for done_chunk_id in range(self._num_chunks):
+                        self._reductions_works[done_block_id][done_chunk_id].wait()
+                # Since the packed format is contiguous after reductions, only one norm is needed
+                l2_grad_norm_sq = self._fp16_g.norm(dtype=torch.float32, p=2)**2
+                torch.distributed.all_reduce(l2_grad_norm_sq, group=self._l2_grad_norm_pg)
+                # .item() copies the value to the host.
+                self._L2_grad_norm = l2_grad_norm_sq.sqrt().item()
+
+
+    def __compute_contrib_param_norm(self):
+        """Return the L2 norms of the model parameters this rank contributes to.
+
+        Uses the ``multi_tensor_l2norm`` kernel on the fp16 and/or fp32
+        parameter lists built in ``__first_step_init__`` and takes element
+        ``[1]`` of its result. When both precisions are present, the two
+        results are interleaved back into the original fragment order using
+        the ``_is_fp16`` / ``_is_fp32`` boolean masks.
+        """
+        if self._contrib_model_param_for_norm_fp16 is not None and self._contrib_model_param_for_norm_fp32 is not None:
+            gnorm_fp16 = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_model_param_for_norm_fp16], True)[1]
+            gnorm_fp32 = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_model_param_for_norm_fp32], True)[1]
+            # The destination must share the dtype of the norms scattered into
+            # it; it was previously allocated as torch.bool, which
+            # masked_scatter_ cannot fill from a non-bool source.
+            gnorm = torch.empty(size=[self._contrib_model_param_for_norm_num], dtype=gnorm_fp32.dtype, device='cuda')
+            gnorm.masked_scatter_(self._contrib_model_param_for_norm_is_fp16, gnorm_fp16)
+            gnorm.masked_scatter_(self._contrib_model_param_for_norm_is_fp32, gnorm_fp32)
+        elif self._contrib_model_param_for_norm_fp16 is not None:
+            gnorm = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_model_param_for_norm_fp16], True)[1]
+        elif self._contrib_model_param_for_norm_fp32 is not None:
+            gnorm = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_model_param_for_norm_fp32], True)[1]
+        return gnorm
+
+
+    def __compute_contrib_update_norm(self):
+        """Return the summed squared L2 norms of the update for contributing parameters.
+
+        Squared per-fragment norms of the update buffers are scattered into a
+        zero vector with one slot per model parameter, summed with
+        ``all_reduce`` over ``self._ag_pg[0]``, and the slots of the
+        parameters this rank contributes to are returned.
+        """
+        contrib_mask = self._model_param_is_contrib
+        fragment_norms = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [self._contrib_update_frag_for_norm], True)[1]
+        squared_norms = fragment_norms ** 2
+        per_param_sq = torch.zeros(size=[self._model_params_num], dtype=torch.float32, device='cuda')
+        per_param_sq.masked_scatter_(contrib_mask, squared_norms)
+        torch.distributed.all_reduce(per_param_sq, group=self._ag_pg[0])
+        return per_param_sq.masked_select(contrib_mask)
+
+
+    def _pipeline_step(self):
+        """Run the LAMB update for this rank's shard and all-gather the new parameters.
+
+        All work is issued on ``self._completion_st``: wait for every
+        outstanding reduction, compute the parameter norms, run the
+        update-term kernel, compute the update norms, run the weight-update
+        kernel, then all-gather ``self._fp16_p`` into the new-parameter
+        mega shards.
+        """
+        # Call step kernel once per step
+        # Call all-gather once per step
+        with torch.cuda.stream(self._completion_st):
+            for block_works in self._reductions_works:
+                for work in block_works:
+                    work.wait()
+
+            contrib_param_norm = self.__compute_contrib_param_norm()
+            clip_norm = self.defaults['max_grad_norm']
+            first_group = self.param_groups[0]
+
+            multi_tensor_applier(
+                self.multi_tensor_lamb_compute_update_term,
+                self._overflow_buf,
+                self._contrib_compute_update_term_tensor_list, # g, p, m, v, u
+                self._contrib_beta1,
+                self._contrib_beta2,
+                self._contrib_beta3,
+                self._contrib_bias_correction,
+                first_group['step'],
+                self._contrib_epsilon,
+                self._adam_w_mode,
+                self._contrib_weight_decay,
+                self.L2_grad_norm,
+                clip_norm)
+
+            contrib_update_norm = self.__compute_contrib_update_norm()
+
+            multi_tensor_applier(
+                self.multi_tensor_lamb_update_weights,
+                self._overflow_buf,
+                self._contrib_update_weights_tensor_list, # u, p, p_copy
+                contrib_param_norm,
+                contrib_update_norm,
+                first_group['lr'],
+                self._contrib_weight_decay,
+                self._use_nvlamb)
+
+            torch.distributed.all_gather(self._new_params_mega_shards, self._fp16_p, group=self._ag_pg[0], no_copy=True)
+
+
+    def _flatten_grad_mt(self, scale):
+        """Apply ``amp_C.multi_tensor_scale`` to the queued gradient pairs and clear the queues.
+
+        The fp16 queue is processed first, then the fp32 queue. Each non-empty
+        queue of ``(param.grad, flat_grad_slot)`` pairs is handed to the
+        multi-tensor scale kernel with ``scale`` and then reset to an empty list.
+        """
+        for queue_name in ('_grads_fp16', '_grads_fp32'):
+            queued_pairs = getattr(self, queue_name)
+            if not queued_pairs:
+                continue
+            self._overflow_buf.zero_()
+            # Transpose [(grad, slot), ...] into the two parallel tensor lists
+            # the multi-tensor kernel takes.
+            tensor_lists = list(zip(*queued_pairs))
+            multi_tensor_applier(
+                    amp_C.multi_tensor_scale,
+                    self._overflow_buf,
+                    tensor_lists,
+                    scale)
+            setattr(self, queue_name, [])
+
+    def _do_overlapped_reduction(self, param_i, param_grads_size, param_offset, param):
+        """Queue the gradient of ``param`` and flush any blocks that are ready.
+
+        Does nothing beyond initialisation on an accumulation step. Otherwise
+        the ``(param.grad, flat slot)`` pair is appended to the fp16 or fp32
+        queue, the parameter is marked as generated, and -- when reductions are
+        overlapped and this is not the last step -- every block reported by
+        ``_get_flush_block`` is pushed through ``_pipeline_block_reductions``.
+
+        ``param_grads_size`` and ``param_offset`` are accepted but not used here.
+        """
+        self._init_everything()
+        if self._is_accumulation_step:
+            return
+
+        # handle overlapped reductions
+        queue = self._grads_fp16 if param.dtype == torch.float16 else self._grads_fp32
+        queue.append( (param.grad, self._individual_flat_grads[param_i]) )
+        self._grads_generated[param_i]=True
+
+        if not self._overlap_reductions or self._last_step:
+            return
+
+        ready = self._get_flush_block()
+        while ready:
+            self._pipeline_block_reductions(ready[0] // self._block_size)
+            ready = self._get_flush_block()
+
+    @property
+    def L2_grad_norm(self):
+        """Return ``self._L2_grad_norm`` after the current stream waits on the norm stream."""
+        current = torch.cuda.current_stream()
+        # Order the caller's stream behind the L2-grad-norm stream before the
+        # value is handed out.
+        current.wait_stream(self._l2_grad_norm_st)
+        return self._L2_grad_norm
+
+    def complete_reductions(self):
+        """Finish the gradient reductions that were not already run during backward.
+
+        On the last step, the flat-gradient slice of every parameter whose
+        gradient was never generated is zeroed. If this is the last step or
+        overlapped reductions are disabled, all blocks are then reduced from
+        the highest block id down to 0. Finally the current stream waits on the
+        L2-grad-norm stream and the per-step bookkeeping is reset.
+        """
+
+        self._init_everything()
+        if self._last_step:
+            # zero out gradients that have not been completed yet
+            for param_i, grad_generated in enumerate(self._grads_generated):
+                if grad_generated:
+                    continue
+                info = self._grads_info[param_i]
+                begin = info["param_offset"]
+                end = begin + info["param_grads_size"]
+                self._flat_grads[begin:end].zero_()
+                self._grads_generated[param_i] = True
+
+        if self._last_step or not self._overlap_reductions:
+            # nothing done so far, run full pipeline after reductions
+            for block_id in reversed(range(self._num_blocks)):
+                self._pipeline_block_reductions(block_id)
+
+        torch.cuda.current_stream().wait_stream(self._l2_grad_norm_st)
+
+        # Reset bookkeeping for the next iteration.
+        self._current_block = self._num_blocks
+        self._grads_generated = [False]*len(self._grads_info)
+
+    def step(self, closure=None):
+        """Perform one optimization step.
+
+        Args:
+            closure (callable, optional): if given, it is called once before the
+                update and its return value is returned from this method.
+
+        Returns:
+            The value returned by ``closure``, or ``None`` when no closure is given.
+        """
+        loss = closure() if closure is not None else None
+
+        # The same step count is assumed across all groups to keep things simple;
+        # per-parameter steps could be supported by making it a tensor or by
+        # passing a list into the kernel.
+        for group in self.param_groups:
+            if 'step' not in group:
+                group['step'] = 1
+            else:
+                group['step'] += 1
+
+        self._pipeline_step()
+
+        with torch.cuda.stream(self._completion_st):
+            # Copy self._new_params to model params
+            self._overflow_buf.zero_()
+            with torch.no_grad():
+                packed_lists = (self._packed_flat_to_model_params_fp16,
+                                self._packed_flat_to_model_params_fp32)
+                for packed in packed_lists:
+                    if packed is None:
+                        continue
+                    multi_tensor_applier(
+                            fused_adam_cuda.maybe_cast_mt,
+                            self._overflow_buf,
+                            packed)
+
+        # Make the caller's stream wait for the update and copy-back to finish.
+        torch.cuda.current_stream().wait_stream(self._completion_st)
+
+        self._reductions_works = [None]*self._num_blocks
+        self._allgather_works = [None]*self._num_blocks
+
+        return loss
+
+
--- a/apex/contrib/test/multihead_attn/test_mha_fused_softmax.py
+++ b/apex/contrib/test/multihead_attn/test_mha_fused_softmax.py
+import torch
+import unittest
+import torch.nn.functional as F
+from apex.contrib.multihead_attn import fast_mask_softmax_dropout_func
+
+class FusedSoftmaxTest(unittest.TestCase):
+    """Compare ``fast_mask_softmax_dropout_func`` against a plain PyTorch reference."""
+
+    def setUp(self, seed=1234):
+        """Seed the RNGs and build the additive mask and two identical fp16 inputs on CUDA."""
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+        self.seq_length   = 80
+        self.sequences    = 10
+        self.hidden_dim   = 1024
+        self.heads        = 16
+        self.dropout_prob = 0.0
+
+        # Random boolean mask turned into an additive fp16 mask:
+        # masked positions get -10000, the rest 0.
+        bool_mask = (torch.randn(self.sequences,self.seq_length)>0).cuda()
+        self.mask = bool_mask.half()*-10000
+
+        input_shape = (self.heads * self.sequences, self.seq_length, self.seq_length)
+        self.ref_inputs = torch.randn(*input_shape,
+                                      dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)
+
+        # Independent leaf tensor with the same values for the fused path.
+        self.tst_inputs = self.ref_inputs.clone().detach().requires_grad_(True)
+
+    def test_fused_softmax(self) :
+        """Forward outputs and input gradients of both paths must agree within tolerance."""
+        grad_out = torch.randn_like(self.tst_inputs)
+
+        # Reference: add the mask per sequence, softmax over the last dim,
+        # then a dropout call with argument 1.0.
+        per_seq = self.ref_inputs.view(self.sequences, self.heads, self.seq_length, self.seq_length)
+        masked = per_seq + self.mask.unsqueeze(1).unsqueeze(2)
+        flat = masked.view(self.sequences*self.heads, self.seq_length, self.seq_length)
+        probs = F.softmax(flat, dim=-1)
+        # _fused_dropout returns a sequence; its first element is the output.
+        dropout_result = torch._fused_dropout(probs, 1.0)
+        ref_out = dropout_result[0]
+
+        tst_out = fast_mask_softmax_dropout_func(True, self.heads, self.tst_inputs, self.mask, True, 0.0)
+
+        ref_out.backward(grad_out)
+        tst_out.backward(grad_out)
+
+        self.assertTrue(torch.allclose(self.ref_inputs,  self.tst_inputs,  atol=1e-5, rtol=1e-5))
+        self.assertTrue(torch.allclose(ref_out, tst_out, atol=1e-3, rtol=1e-3))
+        self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3))
+
+
+# Run the unittest runner when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/apex/optimizers/fused_lamb.py
+++ b/apex/optimizers/fused_lamb.py
@@ -51,6 +51,8 @@ class FusedLAMB(torch.optim.Optimizer):
            method is called. (default: True)
        max_grad_norm (float, optional): value used to clip global grad norm
            (default: 1.0)
+        use_nvlamb (boolean, optional): also apply the adaptive learning rate
+            to parameters whose weight decay is 0.0 (default: False)

    .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
@@ -62,7 +64,7 @@ class FusedLAMB(torch.optim.Optimizer):
                 betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01,
                 amsgrad=False, adam_w_mode=True,
                 grad_averaging=True, set_grad_none=True,
-                 max_grad_norm=1.0):
+                 max_grad_norm=1.0, use_nvlamb=False):
        if amsgrad:
            raise RuntimeError('FusedLAMB does not support the AMSGrad variant.')
        defaults = dict(lr=lr, bias_correction=bias_correction,
@@ -72,6 +74,7 @@ class FusedLAMB(torch.optim.Optimizer):
        super(FusedLAMB, self).__init__(params, defaults)
        if multi_tensor_applier.available:
            import amp_C
+            self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm
            # Skip buffer
            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
            self.multi_tensor_lamb = amp_C.multi_tensor_lamb
@@ -80,6 +83,7 @@ class FusedLAMB(torch.optim.Optimizer):

        self.adam_w_mode = 1 if adam_w_mode else 0
        self.set_grad_none = set_grad_none
+        self.use_nvlamb = use_nvlamb

    def zero_grad(self):
        if self.set_grad_none:
@@ -100,6 +104,37 @@ class FusedLAMB(torch.optim.Optimizer):
        if closure is not None:
            loss = closure()

+        # create separate grad lists for fp32 and fp16 params
+        g_all_32, g_all_16 = [], []
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                if p.dtype == torch.float32:
+                    g_all_32.append(p.grad.data)
+                elif p.dtype == torch.float16:
+                    g_all_16.append(p.grad.data)
+                else:
+                    raise RuntimeError('FusedLAMB only support fp16 and fp32.')
+
+        g_norm_32, g_norm_16 = torch.zeros(1, device='cuda'), torch.zeros(1, device='cuda')
+        # compute grad norm for two lists
+        if len(g_all_32) > 0:
+            g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,
+                                             self._dummy_overflow_buf,
+                                             [g_all_32], False)[0]
+        if len(g_all_16) > 0:
+            g_norm_16 = multi_tensor_applier(self.multi_tensor_l2norm,
+                                             self._dummy_overflow_buf,
+                                             [g_all_16], False)[0]
+
+        # blend two grad norms to get global grad norm
+        global_grad_norm = multi_tensor_applier(self.multi_tensor_l2norm,
+                                                self._dummy_overflow_buf,
+                                                [[g_norm_32, g_norm_16]],
+                                                False)[0]
+        max_grad_norm = self.defaults['max_grad_norm']
+
        for group in self.param_groups:
            bias_correction = 1 if group['bias_correction'] else 0
            beta1, beta2 = group['betas']
@@ -156,7 +191,9 @@ class FusedLAMB(torch.optim.Optimizer):
                                     group['weight_decay'],
                                     grad_averaging,
                                     self.adam_w_mode,
-                                     group['max_grad_norm'])
+                                     global_grad_norm,
+                                     max_grad_norm,
+                                     self.use_nvlamb)
            if(len(g_32) > 0):
                multi_tensor_applier(self.multi_tensor_lamb,
                                     self._dummy_overflow_buf,
@@ -170,6 +207,8 @@ class FusedLAMB(torch.optim.Optimizer):
                                     group['weight_decay'],
                                     grad_averaging,
                                     self.adam_w_mode,
-                                     group['max_grad_norm'])
+                                     global_grad_norm,
+                                     max_grad_norm,
+                                     self.use_nvlamb)

        return loss
--- a/apex/pyprof/nvtx/nvmarker.py
+++ b/apex/pyprof/nvtx/nvmarker.py
@@ -204,6 +204,13 @@ def patchClass(cls):
 			add_wrapper(cls, f)

 def init():
+	string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
+			"soon be removed from Apex.  Please visit\n" + \
+			"https://github.com/NVIDIA/PyProf\n" + \
+			"for the latest version.\n\n"
+	# print regardless of warning state
+	print(string)
+
 	print("Initializing NVTX monkey patches")
 	for cls in [torch, torch.Tensor, torch.nn.functional,]:
 		patchClass(cls)

--- a/csrc/amp_C_frontend.cpp
+++ b/csrc/amp_C_frontend.cpp
@@ -42,7 +42,7 @@ void multi_tensor_lamb_stage1_cuda(
    const float beta1,
    const float beta2,
    const float epsilon,
-    const float global_grad_norm,
+    at::Tensor global_grad_norm,
    const float max_global_grad_norm);

 void multi_tensor_lamb_stage2_cuda(
@@ -51,7 +51,9 @@ void multi_tensor_lamb_stage2_cuda(
    std::vector<std::vector<at::Tensor>> tensor_lists,
    at::Tensor per_tensor_param_norm,
    at::Tensor per_tensor_update_norm,
-    const float step_size);
+    const float lr,
+    const float weight_decay,
+    at::optional<bool> use_nvlamb_python);

 void multi_tensor_adam_cuda(
  int chunk_size,
@@ -106,7 +108,9 @@ void multi_tensor_lamb_cuda(
  const float weight_decay,
  const int grad_averaging,
  const int mode,
-  const float max_grad_norm);
+  at::Tensor global_grad_norm,
+  const float max_grad_norm,
+  at::optional<bool> use_nvlamb_python);

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("multi_tensor_scale", &multi_tensor_scale_cuda,

--- a/csrc/multi_tensor_lamb.cu
+++ b/csrc/multi_tensor_lamb.cu
@@ -52,8 +52,8 @@ struct LAMBStage1Functor
    const float epsilon,
    adamMode_t mode,
    const float decay,
-    float* global_grad_norm,
-    float max_global_grad_norm)
+    const float* global_grad_norm,
+    const float max_global_grad_norm)
  {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
@@ -239,7 +239,9 @@ struct LAMBStage2Functor
    TensorListMetadata<2>* tl,
    const float* per_tensor_param_norm,
    const float* per_tensor_update_norm,
-    const float learning_rate)
+    const float learning_rate,
+    const float decay,
+    bool use_nvlamb)
  {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
@@ -250,9 +252,15 @@ struct LAMBStage2Functor
    int chunk_idx = tl->block_to_chunk[blockIdx.x];
    int n = tl->sizes[tensor_loc];

-    float param_norm = per_tensor_param_norm[tensor_num];
-    float update_norm = per_tensor_update_norm[tensor_num];
-    MATH_T ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
+    MATH_T ratio = learning_rate;
+    // nvlamb: apply adaptive learning rate to all parameters
+    // otherwise, only apply to those with non-zero weight decay
+    if (use_nvlamb || (decay != 0.0))
+    {
+      float param_norm = per_tensor_param_norm[tensor_num];
+      float update_norm = per_tensor_update_norm[tensor_num];
+      ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
+    }

    T* update = (T*)tl->addresses[0][tensor_loc];
    update += chunk_idx*chunk_size;
@@ -334,12 +342,16 @@ void multi_tensor_lamb_cuda(
  const float weight_decay,
  const int grad_averaging,
  const int mode,
-  const float max_grad_norm)
+  at::Tensor global_grad_norm,
+  const float max_grad_norm,
+  at::optional<bool> use_nvlamb_python)
 {
  using namespace at;
  // Master weight and 32bit momentum(potentially changing) is not handled by this
  // So we assume every tensor are all in the same type

+  bool use_nvlamb = use_nvlamb_python.has_value() ? use_nvlamb_python.value() : false;
+
  // Handle bias correction mode
  float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  if (bias_correction == 1) {
@@ -354,9 +366,6 @@ void multi_tensor_lamb_cuda(
  std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
  std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin()+1, tensor_lists.begin()+2);

-  // Compute global grad norm
-  auto grad_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, grad_list, false);
-
  // Compute per tensor param norm
  auto param_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, param_list, true);

@@ -378,7 +387,7 @@ void multi_tensor_lamb_cuda(
        epsilon,
        (adamMode_t) mode,
        weight_decay,
-        std::get<0>(grad_norm_tuple).DATA_PTR<float>(),
+        global_grad_norm.DATA_PTR<float>(),
        max_grad_norm); )

  // Compute update norms
@@ -395,7 +404,9 @@ void multi_tensor_lamb_cuda(
        LAMBStage2Functor<scalar_t_0>(),
        std::get<1>(param_norm_tuple).DATA_PTR<float>(),
        std::get<1>(update_norm_tuple).DATA_PTR<float>(),
-        lr); )
+        lr,
+	weight_decay,
+	use_nvlamb); )

  AT_CUDA_CHECK(cudaGetLastError());


--- a/csrc/multi_tensor_lamb_stage_1.cu
+++ b/csrc/multi_tensor_lamb_stage_1.cu
@@ -118,12 +118,13 @@ void multi_tensor_lamb_stage1_cuda(
  const float beta1,
  const float beta2,
  const float epsilon,
-  const float global_grad_norm,
+  at::Tensor global_grad_norm,
  const float max_global_grad_norm)
 {
  using namespace at;

-  float clipped_global_grad_norm = global_grad_norm > max_global_grad_norm ? global_grad_norm / max_global_grad_norm : 1.0f;
+  const float* g_grad_norm = global_grad_norm.DATA_PTR<float>();
+  float clipped_global_grad_norm = *(g_grad_norm) > max_global_grad_norm ? *(g_grad_norm) / max_global_grad_norm : 1.0f;
  float next_step = float(step+1);
  float beta1_correction = 1.0f - std::pow(beta1, next_step);
  float beta2_correction = 1.0f - std::pow(beta2, next_step);

--- a/csrc/multi_tensor_lamb_stage_2.cu
+++ b/csrc/multi_tensor_lamb_stage_2.cu
@@ -13,6 +13,8 @@
 #define BLOCK_SIZE 512
 #define ILP 4

+using MATH_T = float;
+
 // Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
 // It computes new parameter value.
 template<typename T, typename UPD_T>
@@ -24,7 +26,9 @@ struct LAMBStage2Functor
    TensorListMetadata<2>* tl,
    const float* per_tensor_param_norm,
    const float* per_tensor_update_norm,
-    const float learning_rate)
+    const float learning_rate,
+    const float decay,
+    bool use_nvlamb)
  {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
@@ -35,9 +39,15 @@ struct LAMBStage2Functor
    int chunk_idx = tl->block_to_chunk[blockIdx.x];
    int n = tl->sizes[tensor_loc];

-    float param_norm = per_tensor_param_norm[tensor_num];
-    float update_norm = per_tensor_update_norm[tensor_num];
-    T ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
+    MATH_T ratio = learning_rate;
+    // nvlamb: apply adaptive learning rate to all parameters
+    // otherwise, only apply to those with non-zero weight decay
+    if (use_nvlamb || (decay != 0.0))
+    {
+      float param_norm = per_tensor_param_norm[tensor_num];
+      float update_norm = per_tensor_update_norm[tensor_num];
+      ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
+    }

    T* p = (T*)tl->addresses[0][tensor_loc];
    p += chunk_idx*chunk_size;
@@ -87,8 +97,12 @@ void multi_tensor_lamb_stage2_cuda(
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor per_tensor_param_norm,
  at::Tensor per_tensor_update_norm,
-  const float learning_rate)
+  const float lr,
+  const float weight_decay,
+  at::optional<bool> use_nvlamb_python)
 {
+  bool use_nvlamb = use_nvlamb_python.has_value() ? use_nvlamb_python.value() : false;
+
  using namespace at;

  DISPATCH_FLOAT_AND_HALF_AND_BFLOAT16(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
@@ -101,7 +115,9 @@ void multi_tensor_lamb_stage2_cuda(
        LAMBStage2Functor<scalar_t_0, scalar_t_1>(),
        per_tensor_param_norm.DATA_PTR<float>(),
        per_tensor_update_norm.DATA_PTR<float>(),
-        learning_rate); ))
+        lr,
+	weight_decay,
+	use_nvlamb); ))

  AT_CUDA_CHECK(cudaGetLastError());


--- a/setup.py
+++ b/setup.py
@@ -24,7 +24,7 @@ if not torch.cuda.is_available():
    if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
        os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"

-print("torch.__version__  = ", torch.__version__)
+print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
 TORCH_MAJOR = int(torch.__version__.split('.')[0])
 TORCH_MINOR = int(torch.__version__.split('.')[1])

@@ -37,6 +37,11 @@ ext_modules = []

 extras = {}
 if "--pyprof" in sys.argv:
+    string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
+             "soon be removed from Apex.  Please visit\n" + \
+             "https://github.com/NVIDIA/PyProf\n" + \
+             "for the latest version."
+    warnings.warn(string, DeprecationWarning)
    with open('requirements.txt') as f:
        required_packages = f.read().splitlines()
        extras['pyprof'] = required_packages
@@ -98,6 +103,25 @@ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
    version_ge_1_5 = ['-DVERSION_GE_1_5']
 version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5

+if "--distributed_lamb" in sys.argv:
+    from torch.utils.cpp_extension import CUDAExtension
+    sys.argv.remove("--distributed_lamb")
+
+    from torch.utils.cpp_extension import BuildExtension
+    cmdclass['build_ext'] = BuildExtension
+
+    if torch.utils.cpp_extension.CUDA_HOME is None:
+        raise RuntimeError("--distributed_lamb was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
+    else:
+        ext_modules.append(
+            CUDAExtension(name='distributed_lamb_cuda',
+                          sources=['apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp',
+                                   'apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu'],
+                          include_dirs=[os.path.join(this_dir, 'csrc')],
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
+                                              'nvcc':['-O3',
+                                                      '--use_fast_math'] + version_dependent_macros}))
+
 if "--cuda_ext" in sys.argv:
    from torch.utils.cpp_extension import CUDAExtension
    sys.argv.remove("--cuda_ext")
@@ -293,6 +317,58 @@ if "--fast_multihead_attn" in sys.argv:
        raise RuntimeError("--fast_multihead_attn was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
    else:
        subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"])
+        ext_modules.append(
+            CUDAExtension(name='fast_additive_mask_softmax_dropout',
+                          sources=['apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout.cpp',
+                                   'apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu'],
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag,
+                                              'nvcc':['-O3',
+                                                      '-gencode', 'arch=compute_70,code=sm_70',
+                                                      '-I./apex/contrib/csrc/multihead_attn/cutlass/',
+                                                      '-U__CUDA_NO_HALF_OPERATORS__',
+                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
+                                                      '--expt-relaxed-constexpr',
+                                                      '--expt-extended-lambda',
+                                                      '--use_fast_math'] + version_dependent_macros + generator_flag}))
+        ext_modules.append(
+            CUDAExtension(name='fast_mask_softmax_dropout',
+                          sources=['apex/contrib/csrc/multihead_attn/masked_softmax_dropout.cpp',
+                                   'apex/contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu'],
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag,
+                                              'nvcc':['-O3',
+                                                      '-gencode', 'arch=compute_70,code=sm_70',
+                                                      '-I./apex/contrib/csrc/multihead_attn/cutlass/',
+                                                      '-U__CUDA_NO_HALF_OPERATORS__',
+                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
+                                                      '--expt-relaxed-constexpr',
+                                                      '--expt-extended-lambda',
+                                                      '--use_fast_math'] + version_dependent_macros + generator_flag}))
+        ext_modules.append(
+            CUDAExtension(name='fast_self_multihead_attn_bias_additive_mask',
+                          sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask.cpp',
+                                   'apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu'],
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag,
+                                              'nvcc':['-O3',
+                                                      '-gencode', 'arch=compute_70,code=sm_70',
+                                                      '-I./apex/contrib/csrc/multihead_attn/cutlass/',
+                                                      '-U__CUDA_NO_HALF_OPERATORS__',
+                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
+                                                      '--expt-relaxed-constexpr',
+                                                      '--expt-extended-lambda',
+                                                      '--use_fast_math'] + version_dependent_macros + generator_flag}))
+        ext_modules.append(
+            CUDAExtension(name='fast_self_multihead_attn_bias',
+                          sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_bias.cpp',
+                                   'apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu'],
+                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag,
+                                              'nvcc':['-O3',
+                                                      '-gencode', 'arch=compute_70,code=sm_70',
+                                                      '-I./apex/contrib/csrc/multihead_attn/cutlass/',
+                                                      '-U__CUDA_NO_HALF_OPERATORS__',
+                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
+                                                      '--expt-relaxed-constexpr',
+                                                      '--expt-extended-lambda',
+                                                      '--use_fast_math'] + version_dependent_macros + generator_flag}))
        ext_modules.append(
            CUDAExtension(name='fast_self_multihead_attn',
                          sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp',