Merge branch 'main' into checkpoint_util

80389ef6 · Jared Casper · 1b2db724 · d07d29df · 80389ef6 · 80389ef6
Commit 80389ef6 authored Jun 24, 2022 by Jared Casper
20 changed files
--- a/megatron/model/gpt_model.py
+++ b/megatron/model/gpt_model.py
@@ -32,20 +32,26 @@ def post_language_model_processing(lm_output, labels, logit_weights,
                                   parallel_output,
                                   fp16_lm_cross_entropy):
-    # Output.
+    # Output. Format [s b h]
    output = parallel_lm_logits(
        lm_output,
        logit_weights,
        parallel_output)
    if labels is None:
-        return output
+        # [s b h] => [b s h]
+        return output.transpose(0,1).contiguous()
    else:
+        # [b s] => [s b]
+        labels = labels.transpose(0,1).contiguous()
        if fp16_lm_cross_entropy:
            assert output.dtype == torch.half
            loss = mpu.vocab_parallel_cross_entropy(output, labels)
        else:
            loss = mpu.vocab_parallel_cross_entropy(output.float(), labels)
+        # [s b] => [b, s]
+        loss = loss.transpose(0,1).contiguous()
        return loss

--- a/megatron/model/language_model.py
+++ b/megatron/model/language_model.py
@@ -26,17 +26,29 @@ from megatron.model.transformer import ParallelTransformer
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal, scaled_init_method_normal
 def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
                       bias=None):
    """LM logits using word embedding weights."""
+    args = get_args()
    # Parallel logits.
-    input_parallel = mpu.copy_to_tensor_model_parallel_region(input_)
+    if args.async_tensor_model_parallel_allreduce or\
-    # Matrix multiply.
+            args.sequence_parallel:
-    if bias is None:
+        input_parallel = input_
-        logits_parallel = F.linear(input_parallel, word_embeddings_weight)
+        model_parallel = mpu.get_tensor_model_parallel_world_size() > 1
+        async_grad_allreduce = args.async_tensor_model_parallel_allreduce and \
+            model_parallel and not args.sequence_parallel
    else:
-        logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias)
+        input_parallel = mpu.copy_to_tensor_model_parallel_region(input_)
+        async_grad_allreduce = False
+    # Matrix multiply.
+    logits_parallel = mpu.LinearWithGradAccumulationAndAsyncCommunication.apply(
+        input_parallel, word_embeddings_weight, bias,
+        args.gradient_accumulation_fusion,
+        async_grad_allreduce, args.sequence_parallel)
    # Gather if needed.
    if parallel_output:
        return logits_parallel
@@ -92,12 +104,23 @@ class Pooler(MegatronModule):
    def __init__(self, hidden_size, init_method):
        super(Pooler, self).__init__()
+        args = get_args()
        self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
+        self.sequence_parallel = args.sequence_parallel
    def forward(self, hidden_states, sequence_index=0):
-        # hidden_states: [b, s, h]
+        # hidden_states: [s, b, h]
        # sequence_index: index of the token to pool.
-        pooled = hidden_states[:, sequence_index, :]
+        # gather data along sequence dimensions
+        # same pooler is run on all tensor parallel nodes
+        if self.sequence_parallel:
+            hidden_states = mpu.gather_from_sequence_parallel_region(
+                hidden_states,
+                tensor_parallel_output_grad=False)
+        pooled = hidden_states[sequence_index, :, :]
        pooled = self.dense(pooled)
        pooled = torch.tanh(pooled)
        return pooled
@@ -160,6 +183,8 @@ class Embedding(MegatronModule):
        else:
            self.tokentype_embeddings = None
+        self.fp32_residual_connection = args.fp32_residual_connection 
+        self.sequence_parallel = args.sequence_parallel
        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
@@ -201,8 +226,20 @@ class Embedding(MegatronModule):
        else:
            assert self.tokentype_embeddings is None
+        # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
+        embeddings = embeddings.transpose(0, 1).contiguous()
+        # If the input flag for fp32 residual connection is set, convert for float.
+        if self.fp32_residual_connection:
+            embeddings = embeddings.float()
        # Dropout.
-        embeddings = self.embedding_dropout(embeddings)
+        if self.sequence_parallel:
+            embeddings = mpu.scatter_to_sequence_parallel_region(embeddings)
+            with mpu.get_cuda_rng_tracker().fork():
+                embeddings = self.embedding_dropout(embeddings)
+        else:
+            embeddings = self.embedding_dropout(embeddings)
        return embeddings

--- a/megatron/model/t5_model.py
+++ b/megatron/model/t5_model.py
@@ -152,19 +152,24 @@ class T5Model(MegatronModule):
        if self.post_process and self.add_decoder:
            decoder_output, encoder_output = lm_output
-            # Output.
+            # Output. [s, b, h]
            lm_logits = self.lm_head(decoder_output,
                                     self.word_embeddings_weight())
            if lm_labels is None:
-                return lm_logits
+                # [s b h] => [b s h]
+                return lm_logits.transpose(0,1).contiguous()
            else:
+                # [b s] => [s b]
+                lm_labels = lm_labels.transpose(0,1).contiguous()
                if self.fp16_lm_cross_entropy:
                    assert lm_logits.dtype == torch.half
                    lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
                else:
                    lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
                                                               lm_labels)
+                # [s b] => [b s]
+                lm_loss = lm_loss.transpose(0,1).contiguous()
            return lm_loss
        elif self.add_decoder and not self.add_encoder:
            decoder_output, encoder_output = lm_output

--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -15,10 +15,11 @@
 """Transformer."""
 import math
+from contextlib import nullcontext
 import torch
 import torch.nn.functional as F
-from megatron import get_args
+from megatron import get_timers, get_args, get_global_memory_buffer
 from megatron import mpu
 from .module import MegatronModule
 from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType
@@ -27,6 +28,7 @@ from megatron.model.fused_softmax import FusedScaleMaskSoftmax
 from megatron.model.fused_bias_gelu import bias_gelu_impl
 from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu
 """ We use the following notation throughout this file:
     h: hidden size
     n: number of attention heads
@@ -42,7 +44,6 @@ from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu
        hyperparameters: transformer hyperparameters
 """
 class DropPath(MegatronModule):
    """Drop paths (Stochastic Depth) per sample 
    (when applied in main path of residual blocks).
@@ -116,11 +117,196 @@ class ParallelMLP(MegatronModule):
        output, output_bias = self.dense_4h_to_h(intermediate_parallel)
        return output, output_bias
+class SwitchMLP(MegatronModule):
+    """
+    Routes input to one of N MLP "experts"
+    """
+    def __init__(self, init_method, output_layer_init_method):
+        super(SwitchMLP, self).__init__()
+        args = get_args()
+        self.router = torch.nn.Linear(args.hidden_size, args.num_experts)
+        self.experts = torch.nn.ModuleList()
+        for i in range(args.num_experts):
+            self.experts.append(ParallelMLP(init_method, output_layer_init_method))
+    def forward(self, hidden_states):
+        # hidden_states: [s, b, h]
+        s = hidden_states.size(0)
+        b = hidden_states.size(1)
+        h = hidden_states.size(2)
+        route = self.router(hidden_states)
+        route = torch.nn.functional.softmax(route, dim=2)
+        max_prob, max_ind = torch.max(route, dim=2)
+        max_prob = torch.unsqueeze(max_prob, 2) # [s b 1]
+        # TODO (rprenger) TODO this could be made easier to read
+        # Converting [s, b, h] to [s*b, h].
+        # Each vector could be routed differently
+        hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [s*b h]
+        max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1]
+        max_ind = max_ind.view(-1) # [s*b]
+        output_total = torch.empty_like(hidden_states)
+        output_bias_total = torch.empty_like(hidden_states)
+        #TODO (rprenger) This does each expert in serial, but it could be parallelized
+        for expert_num, expert in enumerate(self.experts):
+            local_indices = (max_ind == expert_num).nonzero()
+            hidden = hidden_states[local_indices,:]
+            output, output_bias = expert(hidden)
+            output_bias = output_bias.expand_as(output)
+            output_total[local_indices,:] = output
+            output_bias_total[local_indices,:] = output_bias
+        output_total = output_total*max_prob
+        output_bias_total = output_bias_total*max_prob
+        output_total = output_total.view(s, b, h)
+        output_bias_total = output_bias_total.view(s, b, h)
+        return output_total, output_bias_total
+class CoreAttention(MegatronModule):
+    def __init__(self, layer_number,
+                 attn_mask_type=AttnMaskType.padding):
+        super(CoreAttention, self).__init__()
+        args = get_args()
+        self.fp16 = args.fp16
+        self.bf16 = args.bf16
+        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
+        if self.apply_query_key_layer_scaling:
+            self.attention_softmax_in_fp32 = True
+        self.layer_number = max(1, layer_number)
+        self.attn_mask_type = attn_mask_type
+        self.sequence_parallel = args.sequence_parallel
+        projection_size = args.kv_channels * args.num_attention_heads
+        # Per attention head and per partition values.
+        world_size = mpu.get_tensor_model_parallel_world_size()
+        self.hidden_size_per_partition = mpu.divide(projection_size,
+                                                    world_size)
+        self.hidden_size_per_attention_head = mpu.divide(
+            projection_size, args.num_attention_heads)
+        self.num_attention_heads_per_partition = mpu.divide(
+            args.num_attention_heads, world_size)
+        coeff = None
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+        self.scale_mask_softmax = FusedScaleMaskSoftmax(
+            self.fp16, self.bf16,
+            self.attn_mask_type,
+            args.masked_softmax_fusion,
+            attention_mask_func,
+            self.attention_softmax_in_fp32,
+            coeff)
+        # Dropout. Note that for a single iteration, this layer will generate
+        # different outputs on different number of parallel partitions but
+        # on average it should not be partition dependent.
+        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)
+    def forward(self, query_layer, key_layer,
+                value_layer, attention_mask):
+        # ===================================
+        # Raw attention scores. [b, np, s, s]
+        # ===================================
+        # [b, np, sq, sk]
+        output_size = (query_layer.size(1),
+                       query_layer.size(2),
+                       query_layer.size(0),
+                       key_layer.size(0))
+        # [sq, b, np, hn] -> [sq, b * np, hn]
+        query_layer = query_layer.view(output_size[2],
+                                       output_size[0] * output_size[1], -1)
+        # [sk, b, np, hn] -> [sk, b * np, hn]
+        key_layer = key_layer.view(output_size[3],
+                                   output_size[0] * output_size[1], -1)
+        # preallocting input tensor: [b * np, sq, sk]
+        matmul_input_buffer = get_global_memory_buffer().get_tensor(
+            (output_size[0]*output_size[1], output_size[2], output_size[3]),
+            query_layer.dtype, "mpu")
+        # Raw attention scores. [b * np, sq, sk]
+        matmul_result = torch.baddbmm(
+            matmul_input_buffer,
+            query_layer.transpose(0, 1),   # [b * np, sq, hn]
+            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+            beta=0.0, alpha=(1.0/self.norm_factor))
+        # change view to [b, np, sq, sk]
+        attention_scores = matmul_result.view(*output_size)
+        # ===========================
+        # Attention probs and dropout
+        # ===========================
+        # attention scores and attention mask [b, np, sq, sk]
+        attention_probs = self.scale_mask_softmax(attention_scores,
+                                                  attention_mask)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        if not self.sequence_parallel:
+            with mpu.get_cuda_rng_tracker().fork():
+                attention_probs = self.attention_dropout(attention_probs)
+        else:
+            attention_probs = self.attention_dropout(attention_probs)
+        # =========================
+        # Context layer. [sq, b, hp]
+        # =========================
+        # value_layer -> context layer.
+        # [sk, b, np, hn] --> [b, np, sq, hn]
+        # context layer shape: [b, np, sq, hn]
+        output_size = (value_layer.size(1),
+                       value_layer.size(2),
+                       query_layer.size(0),
+                       value_layer.size(3))
+        # change view [sk, b * np, hn]
+        value_layer = value_layer.view(value_layer.size(0),
+                                       output_size[0] * output_size[1], -1)
+        # change view [b * np, sq, sk]
+        attention_probs = attention_probs.view(output_size[0] * output_size[1],
+                                               output_size[2], -1)
+        # matmul: [b * np, sq, hn]
+        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+        # change view [b, np, sq, hn]
+        context_layer = context_layer.view(*output_size)
+        # [b, np, sq, hn] --> [sq, b, np, hn]
+        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+        # [sq, b, np, hn] --> [sq, b, hp]
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+        return context_layer
 class ParallelAttention(MegatronModule):
    """Parallel self-attention layer abstract class.
-    Self-attention layer takes input with size [b, s, h]
+    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    """
@@ -130,13 +316,6 @@ class ParallelAttention(MegatronModule):
                 attn_mask_type=AttnMaskType.padding):
        super(ParallelAttention, self).__init__()
        args = get_args()
-        self.fp16 = args.fp16
-        self.bf16 = args.bf16
-        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
-        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
-        if self.apply_query_key_layer_scaling:
-            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)
        self.attention_type = attention_type
        self.attn_mask_type = attn_mask_type
@@ -146,8 +325,6 @@ class ParallelAttention(MegatronModule):
        # Per attention head and per partition values.
        world_size = mpu.get_tensor_model_parallel_world_size()
-        self.hidden_size_per_partition = mpu.divide(projection_size,
-                                                    world_size)
        self.hidden_size_per_attention_head = mpu.divide(
            projection_size, args.num_attention_heads)
        self.num_attention_heads_per_partition = mpu.divide(
@@ -174,24 +351,9 @@ class ParallelAttention(MegatronModule):
                gather_output=False,
                init_method=init_method)
-        coeff = None
+        self.core_attention = CoreAttention(self.layer_number,
-        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+                                            self.attn_mask_type)
-        if self.apply_query_key_layer_scaling:
+        self.checkpoint_core_attention = args.recompute_granularity == 'selective'
-            coeff = self.layer_number
-            self.norm_factor *= coeff
-        self.scale_mask_softmax = FusedScaleMaskSoftmax(
-            self.fp16, self.bf16,
-            self.attn_mask_type,
-            args.masked_softmax_fusion,
-            attention_mask_func,
-            self.attention_softmax_in_fp32,
-            coeff)
-        # Dropout. Note that for a single iteration, this layer will generate
-        # different outputs on different number of parallel partitions but
-        # on average it should not be partition dependent.
-        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)
        # Output.
        self.dense = mpu.RowParallelLinear(
@@ -201,6 +363,23 @@ class ParallelAttention(MegatronModule):
            init_method=output_layer_init_method,
            skip_bias_add=True)
+    def _checkpointed_attention_forward(self, query_layer, key_layer,
+                                        value_layer, attention_mask):
+        """Forward method with activation checkpointing."""
+        def custom_forward(*inputs):
+            query_layer = inputs[0]
+            key_layer = inputs[1]
+            value_layer = inputs[2]
+            attention_mask = inputs[3]
+            output_ = self.core_attention(query_layer, key_layer,
+                                          value_layer, attention_mask)
+            return output_
+        hidden_states = mpu.checkpoint(
+            custom_forward,
+            False, query_layer, key_layer, value_layer, attention_mask)
+        return hidden_states
    def _allocate_memory(self, inference_max_sequence_len, batch_size):
        return torch.empty(
@@ -210,13 +389,11 @@ class ParallelAttention(MegatronModule):
            self.hidden_size_per_attention_head,
            dtype=self.params_dtype,
            device=torch.cuda.current_device())
    def forward(self, hidden_states, attention_mask,
                encoder_output=None, inference_params=None):
        # hidden_states: [sq, b, h]
        # =================================================
        # Pre-allocate memory for key-values for inference.
        # =================================================
@@ -234,7 +411,6 @@ class ParallelAttention(MegatronModule):
                inference_key_memory, inference_value_memory = \
                    inference_params.key_value_memory_dict[self.layer_number]
        # =====================
        # Query, Key, and Value
        # =====================
@@ -275,7 +451,6 @@ class ParallelAttention(MegatronModule):
                 self.hidden_size_per_attention_head)
            query_layer = query_layer.view(*new_tensor_shape)
        # ==================================
        # Adjust key and value for inference
        # ==================================
@@ -297,90 +472,16 @@ class ParallelAttention(MegatronModule):
            value_layer = inference_value_memory[
                :sequence_end, batch_start:batch_end, ...]
+        # ==================================
+        # core attention computation
+        # ==================================
-        # ===================================
+        if self.checkpoint_core_attention:
-        # Raw attention scores. [b, np, s, s]
+            context_layer = self._checkpointed_attention_forward(
-        # ===================================
+                query_layer, key_layer, value_layer, attention_mask)
+        else:
-        # [b, np, sq, sk]
+            context_layer = self.core_attention(
-        output_size = (query_layer.size(1),
+                query_layer, key_layer, value_layer, attention_mask)
-                       query_layer.size(2),
-                       query_layer.size(0),
-                       key_layer.size(0))
-        # [sq, b, np, hn] -> [sq, b * np, hn]
-        query_layer = query_layer.view(output_size[2],
-                                       output_size[0] * output_size[1], -1)
-        # [sk, b, np, hn] -> [sk, b * np, hn]
-        key_layer = key_layer.view(output_size[3],
-                                   output_size[0] * output_size[1], -1)
-        # preallocting result tensor: [b * np, sq, sk]
-        matmul_result = torch.empty(
-            output_size[0]*output_size[1],
-            output_size[2],
-            output_size[3],
-            dtype=query_layer.dtype,
-            device=torch.cuda.current_device())
-        # Raw attention scores. [b * np, sq, sk]
-        matmul_result = torch.baddbmm(
-            matmul_result,
-            query_layer.transpose(0, 1),   # [b * np, sq, hn]
-            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-            beta=0.0, alpha=(1.0/self.norm_factor))
-        # change view to [b, np, sq, sk]
-        attention_scores = matmul_result.view(*output_size)
-        # ===========================
-        # Attention probs and dropout
-        # ===========================
-        # attention scores and attention mask [b, np, sq, sk]
-        attention_probs = self.scale_mask_softmax(attention_scores,
-                                                  attention_mask)
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        with mpu.get_cuda_rng_tracker().fork():
-            attention_probs = self.attention_dropout(attention_probs)
-        # =========================
-        # Context layer. [sq, b, hp]
-        # =========================
-        # value_layer -> context layer.
-        # [sk, b, np, hn] --> [b, np, sq, hn]
-        # context layer shape: [b, np, sq, hn]
-        output_size = (value_layer.size(1),
-                       value_layer.size(2),
-                       query_layer.size(0),
-                       value_layer.size(3))
-        # change view [sk, b * np, hn]
-        value_layer = value_layer.view(value_layer.size(0),
-                                       output_size[0] * output_size[1], -1)
-        # change view [b * np, sq, sk]
-        attention_probs = attention_probs.view(output_size[0] * output_size[1],
-                                               output_size[2], -1)
-        # matmul: [b * np, sq, hn]
-        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
-        # change view [b, np, sq, hn]
-        context_layer = context_layer.view(*output_size)
-        # [b, np, sq, hn] --> [sq, b, np, hn]
-        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
-        # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + \
-            (self.hidden_size_per_partition,)
-        context_layer = context_layer.view(*new_context_layer_shape)
        # =================
        # Output. [sq, b, h]
@@ -423,7 +524,7 @@ def bias_dropout_add_fused_inference(x: torch.Tensor,
 class ParallelTransformerLayer(MegatronModule):
    """A single transformer layer.
-    Transformer layer takes input with size [b, s, h] and returns an
+    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    """
@@ -447,7 +548,8 @@ class ParallelTransformerLayer(MegatronModule):
        self.input_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon,
-            no_persist_layer_norm=args.no_persist_layer_norm)
+            no_persist_layer_norm=args.no_persist_layer_norm,
+            sequence_parallel=args.sequence_parallel)
        # Self attention.
        self.self_attention = ParallelAttention(
@@ -464,7 +566,8 @@ class ParallelTransformerLayer(MegatronModule):
        self.post_attention_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon,
-            no_persist_layer_norm=args.no_persist_layer_norm)
+            no_persist_layer_norm=args.no_persist_layer_norm,
+            sequence_parallel=args.sequence_parallel)
        if self.layer_type == LayerType.decoder:
            self.inter_attention = ParallelAttention(
@@ -476,16 +579,26 @@ class ParallelTransformerLayer(MegatronModule):
            self.post_inter_attention_layernorm = LayerNorm(
                args.hidden_size,
                eps=args.layernorm_epsilon,
-                no_persist_layer_norm=args.no_persist_layer_norm)
+                no_persist_layer_norm=args.no_persist_layer_norm,
+                sequence_parallel=args.sequence_parallel)
        # MLP
-        self.mlp = ParallelMLP(init_method,
+        if args.num_experts is not None:
-                               output_layer_init_method)
+            self.mlp = SwitchMLP(init_method, output_layer_init_method)
+        else:
+            self.mlp = ParallelMLP(init_method, output_layer_init_method)
+        # Set bias+dropout+add fusion grad_enable execution handler.
+        TORCH_MAJOR = int(torch.__version__.split('.')[0])
+        TORCH_MINOR = int(torch.__version__.split('.')[1])
+        use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
+        self.bias_dropout_add_exec_handler = \
+                nullcontext if use_nvfuser else torch.enable_grad
    def forward(self, hidden_states, attention_mask,
                encoder_output=None, enc_dec_attn_mask=None,
                inference_params=None):
-        # hidden_states: [b, s, h]
+        # hidden_states: [s, b, h]
        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
@@ -515,8 +628,7 @@ class ParallelTransformerLayer(MegatronModule):
            else:
                bias_dropout_add_func = get_bias_dropout_add(self.training)
-            # re-enable torch grad to enable fused optimization.
+            with self.bias_dropout_add_exec_handler():
-            with torch.enable_grad():
                layernorm_input = bias_dropout_add_func(
                    attention_output,
                    attention_bias.expand_as(residual),
@@ -542,8 +654,7 @@ class ParallelTransformerLayer(MegatronModule):
            else:
                residual = layernorm_input
-            # re-enable torch grad to enable fused optimization.
+            with self.bias_dropout_add_exec_handler():
-            with torch.enable_grad():
                layernorm_input = bias_dropout_add_func(
                    attention_output,
                    attention_bias.expand_as(residual),
@@ -563,13 +674,23 @@ class ParallelTransformerLayer(MegatronModule):
            residual = layernorm_input
        if self.drop_path is None:
-            # re-enable torch grad to enable fused optimization.
+            with self.bias_dropout_add_exec_handler():
-            with torch.enable_grad():
                output = bias_dropout_add_func(
                    mlp_output,
                    mlp_bias.expand_as(residual),
                    residual,
                    self.hidden_dropout)
+            # Jit compiled function creates 'view' tensor. This tensor
+            # potentially gets saved in the MPU checkpoint function context,
+            # which rejects view tensors. While making a viewless tensor here
+            # won't result in memory savings (like the data loader, or
+            # p2p_communication), it serves to document the origin of this
+            # 'view' tensor.
+            output = mpu.make_viewless_tensor(inp = output,
+                                              requires_grad = output.requires_grad,
+                                              keep_graph = True)
        else:
            out = torch.nn.functional.dropout(mlp_output + mlp_bias,
                                              p=self.hidden_dropout,
@@ -611,22 +732,30 @@ class ParallelTransformer(MegatronModule):
    def __init__(self, init_method, output_layer_init_method,
                 layer_type=LayerType.encoder,
                 self_attn_mask_type=AttnMaskType.padding,
+                 post_layer_norm=True, 
                 pre_process=True, post_process=True,
                 drop_path_rate=0.0):
        super(ParallelTransformer, self).__init__()
        args = get_args()
+        self.layer_type = layer_type
+        self.model_type = args.model_type
        self.bf16 = args.bf16
        self.fp32_residual_connection = args.fp32_residual_connection
+        self.post_layer_norm = post_layer_norm
        self.pre_process = pre_process
        self.post_process = post_process
        self.input_tensor = None
        self.drop_path_rate = drop_path_rate
        # Store activation checkpoiting flag.
-        self.activations_checkpoint_method = args.activations_checkpoint_method
+        self.recompute_granularity = args.recompute_granularity
-        self.activations_checkpoint_num_layers = args.activations_checkpoint_num_layers
+        self.recompute_method = args.recompute_method
-        self.distribute_checkpointed_activations = args.distribute_checkpointed_activations
+        self.recompute_num_layers = args.recompute_num_layers
+        self.distribute_saved_activations = \
+            args.distribute_saved_activations and not args.sequence_parallel
+        self.sequence_parallel = args.sequence_parallel
        # Number of layers.
        self.num_layers = mpu.get_num_layers(
@@ -690,12 +819,13 @@ class ParallelTransformer(MegatronModule):
            self.layers = torch.nn.ModuleList(
                [build_layer(i + 1 + offset) for i in range(self.num_layers)])
-        if self.post_process:
+        if self.post_process and self.post_layer_norm:
            # Final layer norm before output.
            self.final_layernorm = LayerNorm(
                args.hidden_size,
                eps=args.layernorm_epsilon,
-                no_persist_layer_norm=args.no_persist_layer_norm)
+                no_persist_layer_norm=args.no_persist_layer_norm,
+                sequence_parallel=args.sequence_parallel)
    def _get_layer(self, layer_number):
        return self.layers[layer_number]
@@ -715,32 +845,33 @@ class ParallelTransformer(MegatronModule):
                return x_
            return custom_forward
-        if self.activations_checkpoint_method == 'uniform':
+        if self.recompute_method == 'uniform':
            # Uniformly divide the total number of Transformer layers and checkpoint
            # the input activation of each divided chunk.
            # A method to further reduce memory usage reducing checkpoints.
            l = 0
            while l < self.num_layers:
                hidden_states = mpu.checkpoint(
-                    custom(l, l + self.activations_checkpoint_num_layers),
+                    custom(l, l + self.recompute_num_layers),
-                    self.distribute_checkpointed_activations,
+                    self.distribute_saved_activations,
                    hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
-                l += self.activations_checkpoint_num_layers
+                l += self.recompute_num_layers
-        elif self.activations_checkpoint_method == 'block':
+        elif self.recompute_method == 'block':
            # Checkpoint the input activation of only a set number of individual
            # Transformer layers and skip the rest.
            # A method fully use the device memory removing redundant re-computation.
            for l in range(self.num_layers):
-                if l < self.activations_checkpoint_num_layers:
+                if l < self.recompute_num_layers:
                    hidden_states = mpu.checkpoint(
                        custom(l, l + 1),
-                        self.distribute_checkpointed_activations,
+                        self.distribute_saved_activations,
                        hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
                else:
                    hidden_states = custom(l, l + 1)(
                        hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
        else:
-            raise ValueError("Invalid activation checkpoint method.")
+            raise ValueError("Invalid activation recompute method.")
        return hidden_states
@@ -757,21 +888,14 @@ class ParallelTransformer(MegatronModule):
    def forward(self, hidden_states, attention_mask,
                encoder_output=None, enc_dec_attn_mask=None,
                inference_params=None):
+        # hidden_states: [s, b, h]
        # Checks.
        if inference_params:
-            assert self.activations_checkpoint_method is None, \
+            assert self.recompute_granularity is None, \
                'inference does not work with activation checkpointing'
-        if self.pre_process:
+        if not self.pre_process:
-            # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
-            # If the input flag for fp32 residual connection is set, convert for float.
-            if self.fp32_residual_connection:
-                hidden_states = hidden_states.transpose(0, 1).contiguous().float()
-            # Otherwise, leave it as is.
-            else:
-                hidden_states = hidden_states.transpose(0, 1).contiguous()
-        else:
            # See set_input_tensor()
            hidden_states = self.input_tensor
@@ -792,37 +916,34 @@ class ParallelTransformer(MegatronModule):
        #   is called here to be future-proof and corner-case-proof.
        hidden_states = mpu.make_viewless_tensor(
            hidden_states,
-            requires_grad = True,
+            requires_grad=True,
-            keep_graph = True,
+            keep_graph=True,
        )
-        # Transpose encoder output.
+        if self.sequence_parallel:
-        if encoder_output is not None:
+            rng_context = mpu.get_cuda_rng_tracker().fork()
-            encoder_output = encoder_output.transpose(0, 1).contiguous()
-        # Forward pass.
-        if self.activations_checkpoint_method is not None:
-            hidden_states = self._checkpointed_forward(hidden_states,
-                                                       attention_mask,
-                                                       encoder_output,
-                                                       enc_dec_attn_mask)
        else:
-            for index in range(self.num_layers):
+            rng_context = nullcontext()
-                layer = self._get_layer(index)
-                hidden_states = layer(
+        with rng_context:
-                    hidden_states,
+            # Forward pass.
-                    attention_mask,
+            if self.recompute_granularity == 'full':
-                    encoder_output=encoder_output,
+                hidden_states = self._checkpointed_forward(hidden_states,
-                    enc_dec_attn_mask=enc_dec_attn_mask,
+                                                           attention_mask,
-                    inference_params=inference_params)
+                                                           encoder_output,
+                                                           enc_dec_attn_mask)
+            else:
+                for index in range(self.num_layers):
+                    layer = self._get_layer(index)
+                    hidden_states = layer(
+                        hidden_states,
+                        attention_mask,
+                        encoder_output=encoder_output,
+                        enc_dec_attn_mask=enc_dec_attn_mask,
+                        inference_params=inference_params)
        # Final layer norm.
-        if self.post_process:
+        if self.post_process and self.post_layer_norm:
-            # Reverting data format change [s b h] --> [b s h].
+            hidden_states = self.final_layernorm(hidden_states)
-            hidden_states = hidden_states.transpose(0, 1).contiguous()
-            output = self.final_layernorm(hidden_states)
-        else:
-            output = hidden_states
-        return output
+        return hidden_states
--- a/megatron/model/vision/vit_backbone.py
+++ b/megatron/model/vision/vit_backbone.py
@@ -21,7 +21,6 @@ import torch
 import apex
 import torch.nn.functional as F
 from megatron import get_args
-from megatron.model import LayerNorm
 from megatron.model.transformer import ParallelTransformer
 from megatron.model.utils import (
    get_linear_layer,
@@ -148,6 +147,7 @@ class VitBackbone(MegatronModule):
                 post_process=True,
                 class_token=True,
                 single_token_output=False,
+                 post_layer_norm=True,
                 drop_path_rate=0.0):
        super(VitBackbone, self).__init__(share_word_embeddings=False)
        args = get_args()
@@ -165,6 +165,7 @@ class VitBackbone(MegatronModule):
        self.pre_process = pre_process
        self.post_process = post_process
        self.class_token = class_token
+        self.post_layer_norm = post_layer_norm
        self.hidden_size = args.hidden_size
        self.patch_dim = args.patch_dim
        self.img_h = args.img_h
@@ -218,6 +219,7 @@ class VitBackbone(MegatronModule):
            self.scaled_init_method,
            pre_process=self.pre_process,
            post_process=self.post_process,
+            post_layer_norm=self.post_layer_norm,
            drop_path_rate=self.drop_path_rate
        )

--- a/megatron/mpu/__init__.py
+++ b/megatron/mpu/__init__.py
@@ -49,17 +49,21 @@ from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pi
 from .initialize import initialize_model_parallel
 from .initialize import model_parallel_is_initialized
+from .layers import LinearWithGradAccumulationAndAsyncCommunication
 from .layers import ColumnParallelLinear
 from .layers import RowParallelLinear
 from .layers import VocabParallelEmbedding
 from .layers import (set_tensor_model_parallel_attributes,
                     set_defaults_if_not_set_tensor_model_parallel_attributes,
                     copy_tensor_model_parallel_attributes)
-from .mappings import copy_to_tensor_model_parallel_region
+from .mappings import  copy_to_tensor_model_parallel_region
-from .mappings import gather_from_tensor_model_parallel_region
+from .mappings import  reduce_from_tensor_model_parallel_region
-from .mappings import reduce_from_tensor_model_parallel_region
+from .mappings import  scatter_to_tensor_model_parallel_region
-from .mappings import scatter_to_tensor_model_parallel_region
+from .mappings import  gather_from_tensor_model_parallel_region
+from .mappings import  scatter_to_sequence_parallel_region
+from .mappings import  gather_from_sequence_parallel_region
+from .mappings import  reduce_scatter_to_sequence_parallel_region
 from .random import checkpoint
 from .random import get_cuda_rng_tracker

--- a/megatron/mpu/initialize.py
+++ b/megatron/mpu/initialize.py
@@ -54,6 +54,12 @@ _POSITION_EMBEDDING_GLOBAL_RANKS = None
 # rank when broadcasting from the first or last pipeline stage.
 _PIPELINE_GLOBAL_RANKS = None
+# A list of global ranks for each data parallel group to ease calculation of the source
+# rank when broadcasting weights from src to all other data parallel ranks
+_DATA_PARALLEL_GLOBAL_RANKS = None
 def is_unitialized():
    """Useful for code segments that may be accessed with or without mpu initialization"""
    return _DATA_PARALLEL_GROUP is None
@@ -124,6 +130,7 @@ def initialize_model_parallel(tensor_model_parallel_size_=1,
    # Build the data-parallel groups.
    global _DATA_PARALLEL_GROUP
+    global _DATA_PARALLEL_GLOBAL_RANKS
    assert _DATA_PARALLEL_GROUP is None, \
        'data parallel group is already initialized'
    all_data_parallel_group_ranks = []
@@ -137,6 +144,7 @@ def initialize_model_parallel(tensor_model_parallel_size_=1,
            group = torch.distributed.new_group(ranks)
            if rank in ranks:
                _DATA_PARALLEL_GROUP = group
+                _DATA_PARALLEL_GLOBAL_RANKS = ranks
    # Build the model-parallel groups.
    global _MODEL_PARALLEL_GROUP
@@ -478,11 +486,10 @@ def get_tensor_model_parallel_src_rank():
 def get_data_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
-    in the tensor model parallel group."""
+    in the data parallel group."""
-    global_rank = torch.distributed.get_rank()
+    assert _DATA_PARALLEL_GLOBAL_RANKS is not None, \
-    data_parallel_size = get_data_parallel_world_size()
+        "Data parallel group is not initialized"
-    num_data_parallel_groups = torch.distributed.get_world_size() // data_parallel_size
+    return _DATA_PARALLEL_GLOBAL_RANKS[0]
-    return global_rank % num_data_parallel_groups
 def get_pipeline_model_parallel_first_rank():

--- a/megatron/mpu/layers.py
+++ b/megatron/mpu/layers.py
@@ -30,20 +30,21 @@ from .initialize import get_tensor_model_parallel_world_size
 from .initialize import get_tensor_model_parallel_group
 from .mappings import copy_to_tensor_model_parallel_region
 from .mappings import gather_from_tensor_model_parallel_region
+from .mappings import gather_from_sequence_parallel_region
 from .mappings import reduce_from_tensor_model_parallel_region
 from .mappings import scatter_to_tensor_model_parallel_region
+from .mappings import reduce_scatter_to_sequence_parallel_region
 from .random import get_cuda_rng_tracker
 from .utils import divide
 from .utils import split_tensor_along_last_dim
 from .utils import VocabUtility
-from megatron import get_args
+from megatron import get_args, get_global_memory_buffer
 _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                                      'partition_dim': -1,
                                      'partition_stride': 1}
 def param_is_not_tensor_parallel_duplicate(param):
    return (hasattr(param, 'tensor_model_parallel') and
            param.tensor_model_parallel) or (
@@ -201,16 +202,37 @@ class VocabParallelEmbedding(torch.nn.Module):
        return output
-class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
+class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
    """
-    Column-parallel linear layer execution with asynchronous all-reduce
+    Linear layer execution with asynchronous communication and gradient accumulation
-    execution in backprop.
+    fusion in backprop.
    """
    @staticmethod
-    def forward(ctx, input, weight, bias):
+    def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
+                async_grad_allreduce, sequence_parallel):
        ctx.save_for_backward(input, weight)
        ctx.use_bias = bias is not None
-        output = torch.matmul(input, weight.t())
+        ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
+        ctx.async_grad_allreduce = async_grad_allreduce
+        ctx.sequence_parallel = sequence_parallel
+        if sequence_parallel:
+            world_size = get_tensor_model_parallel_world_size()
+            dim_size = list(input.size())
+            dim_size[0] = dim_size[0] * world_size
+            all_gather_buffer = \
+                get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
+            torch.distributed._all_gather_base(
+                all_gather_buffer,
+                input,
+                group=get_tensor_model_parallel_group())
+            total_input = all_gather_buffer
+        else:
+            total_input = input
+        output = torch.matmul(total_input, weight.t())
        if bias is not None:
            output = output + bias
        return output
@@ -219,17 +241,75 @@ class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
    def backward(ctx, grad_output):
        input, weight = ctx.saved_tensors
        use_bias = ctx.use_bias
+        if ctx.sequence_parallel:
+            world_size = get_tensor_model_parallel_world_size()
+            dim_size = list(input.size())
+            dim_size[0] = dim_size[0] * world_size
+            all_gather_buffer = \
+                get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
+            handle = torch.distributed._all_gather_base(
+                all_gather_buffer,
+                input,
+                group=get_tensor_model_parallel_group(), async_op=True)
+            # Delay the start of intput gradient computation shortly (3us) to have
+            # gather scheduled first and have GPU resources allocated
+            _ = torch.empty(1, device=grad_output.device) + 1
+            total_input = all_gather_buffer
+        else:
+            total_input = input
        grad_input = grad_output.matmul(weight)
-        # Asyncronous all-reduce
-        handle = torch.distributed.all_reduce(
+        if ctx.sequence_parallel:
-                grad_input, group=get_tensor_model_parallel_group(), async_op=True)
+            handle.wait()
-        # Delay the start of weight gradient computation shortly (3us) to have
-        # all-reduce scheduled first and have GPU resources allocated
+        # Convert the tensor shapes to 2D for execution compatibility
-        _ = torch.empty(1, device=grad_output.device) + 1
+        grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
-        grad_weight = grad_output.t().matmul(input)
+                                       grad_output.shape[2])
+        total_input = total_input.view(total_input.shape[0] * total_input.shape[1],
+				       total_input.shape[2])
+        if ctx.async_grad_allreduce:
+            # Asynchronous all-reduce
+            handle = torch.distributed.all_reduce(
+                    grad_input, group=get_tensor_model_parallel_group(), async_op=True)
+            # Delay the start of weight gradient computation shortly (3us) to have
+            # all-reduce scheduled first and have GPU resources allocated
+            _ = torch.empty(1, device=grad_output.device) + 1
+        if ctx.sequence_parallel:
+            assert not ctx.async_grad_allreduce
+            dim_size = list(input.size())
+            sub_grad_input = torch.empty(dim_size, dtype=input.dtype,
+                                         device=torch.cuda.current_device(),
+                                         requires_grad=False)
+            # reduce_scatter
+            handle = torch.distributed._reduce_scatter_base(sub_grad_input, grad_input, 
+                                                            group=get_tensor_model_parallel_group(),
+                                                            async_op=True)
+            # Delay the start of weight gradient computation shortly (3us) to have
+            # reduce scatter scheduled first and have GPU resources allocated
+            _ = torch.empty(1, device=grad_output.device) + 1
+        if ctx.gradient_accumulation_fusion:
+            import fused_dense_cuda
+            fused_dense_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad)
+            grad_weight = None
+        else:
+            grad_weight = grad_output.t().matmul(total_input)
        grad_bias = grad_output.sum(dim=0) if use_bias else None
-        handle.wait()
-        return grad_input, grad_weight, grad_bias
+        if ctx.sequence_parallel:
+            handle.wait()
+            return sub_grad_input, grad_weight, grad_bias, None, None, None
+        if ctx.async_grad_allreduce:
+            handle.wait()
+        return grad_input, grad_weight, grad_bias, None, None, None
 class ColumnParallelLinear(torch.nn.Module):
@@ -242,7 +322,7 @@ class ColumnParallelLinear(torch.nn.Module):
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias
-        gather_output: If true, call all-gather on output and make Y avaiable
+        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        init_method: method to initialize weights. Note that bias is always set
@@ -309,31 +389,30 @@ class ColumnParallelLinear(torch.nn.Module):
        else:
            self.register_parameter('bias', None)
        self.async_tensor_model_parallel_allreduce = (
-                not args.no_async_tensor_model_parallel_allreduce and
+                args.async_tensor_model_parallel_allreduce and
                world_size > 1)
+        self.sequence_parallel = (
+                args.sequence_parallel and
+                world_size > 1)
+        assert not self.async_tensor_model_parallel_allreduce or \
+            not self.sequence_parallel
+        self.gradient_accumulation_fusion = args.gradient_accumulation_fusion
    def forward(self, input_):
        bias = self.bias if not self.skip_bias_add else None
-        if self.async_tensor_model_parallel_allreduce:
+        if self.async_tensor_model_parallel_allreduce or \
-            input_shape = input_.shape
+                self.sequence_parallel:
-            input_ = input_.view(input_shape[0] * input_shape[1],input_shape[2])
+            input_parallel = input_
-            # Maxtrix multiply with asynchronouse all-reduce execution
-            output_parallel = ColumnParallelLinearWithAsyncAllreduce.apply(
-                    input_, self.weight, bias)
-            output_parallel = output_parallel.view(
-                    input_shape[0], input_shape[1], output_parallel.shape[1])
        else:
-            # Set up backprop all-reduce.
            input_parallel = copy_to_tensor_model_parallel_region(input_)
+        # Matrix multiply.
-            # Matrix multiply.
+        output_parallel = LinearWithGradAccumulationAndAsyncCommunication.apply(
-            output_parallel = F.linear(input_parallel, self.weight, bias)
+            input_parallel, self.weight, bias, self.gradient_accumulation_fusion,
+            self.async_tensor_model_parallel_allreduce, self.sequence_parallel)
        if self.gather_output:
            # All-gather across the partitions.
+            assert not self.sequence_parallel
            output = gather_from_tensor_model_parallel_region(output_parallel)
        else:
            output = output_parallel
@@ -416,11 +495,15 @@ class RowParallelLinear(torch.nn.Module):
                self.bias = Parameter(torch.empty(
                    self.output_size, device=torch.cuda.current_device(),
                    dtype=args.params_dtype))
+            setattr(self.bias, 'sequence_parallel', args.sequence_parallel)
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)
+        self.sequence_parallel = args.sequence_parallel
+        self.gradient_accumulation_fusion = args.gradient_accumulation_fusion
@@ -429,11 +512,17 @@ class RowParallelLinear(torch.nn.Module):
        if self.input_is_parallel:
            input_parallel = input_
        else:
+            assert not self.sequence_parallel
            input_parallel = scatter_to_tensor_model_parallel_region(input_)
        # Matrix multiply.
-        output_parallel = F.linear(input_parallel, self.weight)
+        output_parallel = LinearWithGradAccumulationAndAsyncCommunication.apply(
+            input_parallel, self.weight, None,
+            self.gradient_accumulation_fusion, None, None)
        # All-reduce across all the partitions.
-        output_ = reduce_from_tensor_model_parallel_region(output_parallel)
+        if self.sequence_parallel:
+            output_ = reduce_scatter_to_sequence_parallel_region(output_parallel)
+        else:
+            output_ = reduce_from_tensor_model_parallel_region(output_parallel)
        if not self.skip_bias_add:
            output = output_ + self.bias if self.bias is not None else output_
            output_bias = None

--- a/megatron/mpu/mappings.py
+++ b/megatron/mpu/mappings.py
@@ -32,13 +32,13 @@ def _reduce(input_):
    return input_
-def _split(input_):
+def _split_along_last_dim(input_):
    """Split the tensor along its last dimension and keep the
    corresponding slice."""
    world_size = get_tensor_model_parallel_world_size()
    # Bypass the function if we are using only 1 GPU.
-    if world_size==1:
+    if world_size == 1:
        return input_
    # Split along last dimension.
@@ -51,12 +51,34 @@ def _split(input_):
    return output
-def _gather(input_):
+def _split_along_first_dim(input_):
+    """Split the tensor along its first dimension and keep the
+    corresponding slice."""
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    # Split along first dimension.
+    dim_size = input_.size()[0]
+    assert dim_size % world_size == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+    local_dim_size = dim_size // world_size
+    rank = get_tensor_model_parallel_rank()
+    dim_offset = rank * local_dim_size
+    output = input_[dim_offset:dim_offset+local_dim_size].contiguous()
+    return output
+def _gather_along_last_dim(input_):
    """Gather tensors and concatinate along the last dimension."""
    world_size = get_tensor_model_parallel_world_size()
    # Bypass the function if we are using only 1 GPU.
-    if world_size==1:
+    if world_size == 1:
        return input_
    # Size and dimension.
@@ -73,6 +95,44 @@ def _gather(input_):
    return output
+def _gather_along_first_dim(input_):
+    """Gather tensors and concatinate along the first dimension."""
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    dim_size = list(input_.size())
+    dim_size[0] = dim_size[0] * world_size
+    output = torch.empty(dim_size, dtype=input_.dtype,
+                         device=torch.cuda.current_device())
+    torch.distributed._all_gather_base(output, input_.contiguous(),
+                                       group=get_tensor_model_parallel_group())
+    return output
+def _reduce_scatter_along_first_dim(input_):
+    """Reduce-scatter the input tensor across model parallel group."""
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    dim_size = list(input_.size())
+    assert dim_size[0] % world_size == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+    dim_size[0] = dim_size[0] // world_size
+    output = torch.empty(dim_size, dtype=input_.dtype,
+                         device=torch.cuda.current_device())
+    torch.distributed._reduce_scatter_base(output, input_.contiguous(), 
+                                           group=get_tensor_model_parallel_group())
+    return output
 class _CopyToModelParallelRegion(torch.autograd.Function):
    """Pass the input to the model parallel region."""
@@ -110,15 +170,15 @@ class _ScatterToModelParallelRegion(torch.autograd.Function):
    @staticmethod
    def symbolic(graph, input_):
-        return _split(input_)
+        return _split_along_last_dim(input_)
    @staticmethod
    def forward(ctx, input_):
-        return _split(input_)
+        return _split_along_last_dim(input_)
    @staticmethod
    def backward(ctx, grad_output):
-        return _gather(grad_output)
+        return _gather_along_last_dim(grad_output)
 class _GatherFromModelParallelRegion(torch.autograd.Function):
@@ -126,15 +186,73 @@ class _GatherFromModelParallelRegion(torch.autograd.Function):
    @staticmethod
    def symbolic(graph, input_):
-        return _gather(input_)
+        return _gather_along_last_dim(input_)
    @staticmethod
    def forward(ctx, input_):
-        return _gather(input_)
+        return _gather_along_last_dim(input_)
    @staticmethod
    def backward(ctx, grad_output):
-        return _split(grad_output)
+        return _split_along_last_dim(grad_output)
+class _ScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the corresponding chuck to the rank."""
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split_along_first_dim(input_)
+    @staticmethod
+    def forward(ctx, input_):
+        return _split_along_first_dim(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_first_dim(grad_output)
+class _GatherFromSequenceParallelRegion(torch.autograd.Function):
+    """Gather the input from sequence parallel region and concatinate.""" 
+    @staticmethod
+    def symbolic(graph, input_, tensor_parallel_output_grad=True):
+        return _gather_along_first_dim(input_)
+    @staticmethod
+    def forward(ctx, input_, tensor_parallel_output_grad=True):
+        ctx.tensor_parallel_output_grad = tensor_parallel_output_grad
+        return _gather_along_first_dim(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        tensor_parallel_output_grad = ctx.tensor_parallel_output_grad
+        # If the computation graph after the gather operation is
+        # in the tensor parallel mode, output gradients need to reduce 
+        # scattered and whereas if the computation is duplicated, 
+        # output gradients need to be scattered.
+        if tensor_parallel_output_grad:
+            return _reduce_scatter_along_first_dim(grad_output), None
+        else:
+            return _split_along_first_dim(grad_output), None
+class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Reduce scatter the input from the model parallel region."""
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce_scatter_along_first_dim(input_)
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce_scatter_along_first_dim(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_first_dim(grad_output)
 # -----------------
@@ -155,3 +273,16 @@ def scatter_to_tensor_model_parallel_region(input_):
 def gather_from_tensor_model_parallel_region(input_):
    return _GatherFromModelParallelRegion.apply(input_)
+def scatter_to_sequence_parallel_region(input_):
+    return _ScatterToSequenceParallelRegion.apply(input_)
+def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
+    return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)
+def reduce_scatter_to_sequence_parallel_region(input_):
+    return _ReduceScatterToSequenceParallelRegion.apply(input_)
--- a/megatron/mpu/random.py
+++ b/megatron/mpu/random.py
@@ -307,10 +307,10 @@ class CheckpointFunction(torch.autograd.Function):
              tracked/set/reset.
    """
    @staticmethod
-    def forward(ctx, run_function, distribute_checkpointed_activations, *args):
+    def forward(ctx, run_function, distribute_saved_activations, *args):
        ctx.run_function = run_function
-        ctx.distribute_checkpointed_activations \
+        ctx.distribute_saved_activations \
-            = distribute_checkpointed_activations
+            = distribute_saved_activations
        # Copy the rng states.
        ctx.fwd_cpu_rng_state = torch.get_rng_state()
@@ -322,7 +322,7 @@ class CheckpointFunction(torch.autograd.Function):
        # Divide hidden states across model parallel group and only keep
        # the chunk corresponding to the current rank.
-        if distribute_checkpointed_activations:
+        if distribute_saved_activations:
            ctx.input_0_shape = args[0].data.shape
            safely_set_viewless_tensor_data(
                args[0],
@@ -339,7 +339,7 @@ class CheckpointFunction(torch.autograd.Function):
            raise RuntimeError("Checkpointing is not compatible with .grad(), "
                               "please use .backward() if possible")
        inputs = ctx.saved_tensors
-        if ctx.distribute_checkpointed_activations:
+        if ctx.distribute_saved_activations:
            safely_set_viewless_tensor_data(
                inputs[0],
                gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
@@ -372,8 +372,8 @@ class CheckpointFunction(torch.autograd.Function):
        return (None, None) + grads
-def checkpoint(function, distribute_checkpointed_activations, *args):
+def checkpoint(function, distribute_saved_activations, *args):
    """Checkpoint a model or part of the model.
    This has been directly copied from torch.utils.checkpoint."""
    return CheckpointFunction.apply(function,
-                                    distribute_checkpointed_activations, *args)
+                                    distribute_saved_activations, *args)
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -17,8 +17,8 @@ from apex.optimizers import FusedAdam as Adam
 from apex.optimizers import FusedSGD as SGD
 from megatron import get_args
-from megatron.model import LayerNorm
+from .distrib_optimizer import DistributedOptimizer
 from .grad_scaler import ConstantGradScaler, DynamicGradScaler
 from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
@@ -105,7 +105,11 @@ def get_megatron_optimizer(model,
    if args.DDP_impl == 'local':
        params_have_main_grad = True
-    if args.fp16 or args.bf16:
+    # Mixed precision optimizer.
+    # - Note: both the Float16Optimizer and the DistributedOptimizer inherit
+    #   from the MixedPrecisionOptimizer, which manages any optimizer where
+    #   the model params and main params are distinct.
+    if args.fp16 or args.bf16 or args.use_distributed_optimizer:
        # Grad scaler:
        #    if loss-scale is provided, instantiate the constant scaler.
@@ -114,9 +118,11 @@ def get_megatron_optimizer(model,
        #    otherwise we are running in bf16 with no loss-scale so
        #       leave it as None.
        grad_scaler = None
        # Constant loss scale.
        if args.loss_scale:
            grad_scaler = ConstantGradScaler(args.loss_scale)
        # Dynamic loss scale.
        else:
            if args.fp16:
@@ -129,16 +135,22 @@ def get_megatron_optimizer(model,
                    hysteresis=args.hysteresis)
        # Megatron optimizer.
-        return Float16OptimizerWithFloat16Params(optimizer,
+        opt_ty = DistributedOptimizer \
-                                                 args.clip_grad,
+            if args.use_distributed_optimizer else \
-                                                 args.log_num_zeros_in_grad,
+            Float16OptimizerWithFloat16Params
-                                                 params_have_main_grad,
+        return opt_ty(optimizer,
-                                                 args.use_contiguous_buffers_in_local_ddp,
+                      args.clip_grad,
-                                                 args.bf16,
+                      args.log_num_zeros_in_grad,
-                                                 grad_scaler)
+                      params_have_main_grad,
+                      args.use_contiguous_buffers_in_local_ddp,
+                      args.fp16,
+                      args.bf16,
+                      grad_scaler,
+                      model)
    # FP32.
    return FP32Optimizer(optimizer, args.clip_grad,
                         args.log_num_zeros_in_grad,
                         params_have_main_grad,
-                         args.use_contiguous_buffers_in_local_ddp)
+                         args.use_contiguous_buffers_in_local_ddp,
+                         model)
--- a/megatron/optimizer/clip_grads.py
+++ b/megatron/optimizer/clip_grads.py
@@ -21,12 +21,13 @@ from torch._six import inf
 from apex.multi_tensor_apply import multi_tensor_applier
 import amp_C
-from megatron import mpu
 from megatron.model.module import param_is_not_shared
 from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
-def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
+def clip_grad_norm_fp32(parameters, grads_for_norm,
+                        max_norm, norm_type=2,
+                        model_parallel_group=None):
    """Clips gradient norm of an iterable of parameters whose gradients
       are in fp32.
@@ -37,9 +38,13 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
+        grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single
+            Tensor that will be used for calculating the grad norm.
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.
+        model_parallel_group (group): given the nature of the distributed
+            optimizer, this is passed as an argument.
    Returns:
        Total norm of the parameters (viewed as a single vector).
@@ -47,25 +52,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
+    if isinstance(grads_for_norm, torch.Tensor):
+        grads_for_norm = [grads_for_norm]
-    # Filter parameters based on:
+    # Grads.
-    #   - grad should not be none
-    #   - parameter should not be shared
-    #   - should not be a replica due to tensor model parallelism
    grads = []
-    grads_for_norm = []
    for param in parameters:
-        grad_not_none = param.grad is not None
+        if param.grad is not None:
-        is_not_shared = param_is_not_shared(param)
-        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
-        if grad_not_none:
-            grad = param.grad.detach()
-        if grad_not_none:
-            # Make sure the grads are in fp32
            assert param.grad.type() == 'torch.cuda.FloatTensor'
-            grads.append(grad)
+            grads.append(param.grad.detach())
-        if grad_not_none and is_not_shared and is_not_tp_duplicate:
-            grads_for_norm.append(grad)
    # Norm parameters.
    max_norm = float(max_norm)
@@ -79,7 +74,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
        # Take max across all model-parallel GPUs.
        torch.distributed.all_reduce(total_norm_cuda,
                                     op=torch.distributed.ReduceOp.MAX,
-                                     group=mpu.get_model_parallel_group())
+                                     group=model_parallel_group)
        total_norm = total_norm_cuda[0].item()
    else:
@@ -88,12 +83,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
            # Use apex's multi-tensor applier for efficiency reasons.
            # Multi-tensor applier takes a function and a list of list
            # and performs the operation on that list all in one kernel.
-            grad_norm, _ = multi_tensor_applier(
+            if grads_for_norm:
-                amp_C.multi_tensor_l2norm,
+                grad_norm, _ = multi_tensor_applier(
-                dummy_overflow_buf,
+                    amp_C.multi_tensor_l2norm,
-                [grads_for_norm],
+                    dummy_overflow_buf,
-                False # no per-parameter norm
+                    [grads_for_norm],
-            )
+                    False # no per-parameter norm
+                )
+            else:
+                grad_norm = torch.cuda.FloatTensor([0])
            # Since we will be summing across data parallel groups,
            # we need the pow(norm-type).
            total_norm = grad_norm ** norm_type
@@ -106,7 +104,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
        # Sum across all model-parallel GPUs.
        torch.distributed.all_reduce(total_norm,
                                     op=torch.distributed.ReduceOp.SUM,
-                                     group=mpu.get_model_parallel_group())
+                                     group=model_parallel_group)
        total_norm = total_norm.item() ** (1.0 / norm_type)
    # Scale.
@@ -121,7 +119,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    return total_norm
-def count_zeros_fp32(parameters):
+def count_zeros_fp32(parameters, model_parallel_group):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
@@ -130,7 +128,7 @@ def count_zeros_fp32(parameters):
    #   - grad should not be none
    #   - parameter should not be shared
    #   - should not be a replica due to tensor model parallelism
-    total_num_zeros = 0.0
+    total_num_zeros = torch.cuda.FloatTensor([0.0])
    for param in parameters:
        grad_not_none = param.grad is not None
        is_not_shared = param_is_not_shared(param)
@@ -143,7 +141,8 @@ def count_zeros_fp32(parameters):
    # Sum across all model-parallel GPUs.
    torch.distributed.all_reduce(total_num_zeros,
                                 op=torch.distributed.ReduceOp.SUM,
-                                 group=mpu.get_model_parallel_group())
+                                 group=model_parallel_group)
    total_num_zeros = total_num_zeros.item()
    return total_num_zeros
--- a/megatron/optimizer/distrib_optimizer.py
+++ b/megatron/optimizer/distrib_optimizer.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Megatron distributed optimizer."""
+import math
+import torch
+from megatron import get_args
+from megatron import get_timers
+from megatron import mpu
+from megatron import print_rank_0
+from megatron.model.module import param_is_not_shared
+from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
+from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper
+class Range:
+    """
+    A range represents a start and end points for indexing a shard
+    from a full tensor.
+    """
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+        self.size = end - start
+    def normalize(self, start = 0):
+        return Range(start, start + self.size)
+    def __str__(self):
+        return "%d,%d [%d]" % (self.start, self.end, self.size)
+class DistributedOptimizer(MixedPrecisionOptimizer):
+    """Distributed optimizer, for all data types (fp16, bf16, and fp32).
+    Arguments:
+        optimizer: base optimizer such as Adam or SGD
+        clip_grad: clip gradeints with this global L2 norm. Note
+            that clipping is ignored if clip_grad == 0
+        log_num_zeros_in_grad: return number of zeros in the gradients.
+        params_have_main_grad: flag indicating if parameters have
+            a `main_grad` field. If this is set, we are assuming
+            that the model parameters are store in the `main_grad`
+            field instead of the typical `grad` field. This happens
+            for the DDP cases where there is a continuous buffer
+            holding the gradients. For example for bfloat16, we want
+            to do gradient accumulation and all-reduces in float32
+            and as a result we store those gradients in the main_grad.
+            Note that main grad is not necessarily in float32.
+        use_contiguous_buffers_in_local_ddp: if true, the local DDP model
+            is using a contiguous buffer to hold the model grads.
+        fp16: if true, the model is running in fp16.
+        bf16: if true, the model is running in bfloat16.
+        grad_scaler: used for scaling gradients. Note that this can be
+            None. This case happens when `bf16 = True` and we don't
+            use any loss scale. Note that for `bf16 = True`, we can have
+            a constnat gradient scaler. Also for `bf16 = False`, we
+            always require a grad scaler.
+        models: list of models (i.e., the virtual pipelining models). This
+            is used by the distributed optimizer for mapping parameters.
+    """
+    @classmethod
+    def build_model_gbuf_param_range_map(cls, model, dtype, gbuf_world_range):
+        """
+        Build mapping from param reference to grad buffer shard ranges.
+        This method builds a mapping from parameter references to grad
+        buffer shard ranges, specific to each data-parallel (DP) rank's
+        set of 'owned' parameters. Each grad buffer (padded to be an even
+        multiple of DP-world-size) is conceptually divided into DP-world-size
+        contiguous regions, where each DP rank 'owns' a contiguous regions.
+        Ownership in this sense means DP rank is responsible for reducing
+        the relevant subset of grads, and updating the relevant subset of
+        params.
+        This conceptual partitioning of the grad buffer does NOT respect
+        parameter boundaries, and as such it is assumed that each created
+        range references a shard (or subset) of the full parameter. It is
+        easiest to think of each DP rank as operating (i.e., reducing,
+        gathering) purely on views into the grad buffer, for all model-to-
+        main & main-to-model operations.
+        This method creates three ranges:
+        - The param's range within the entire grad buffer (i.e., world index).
+        - The param's range within the DP rank's local view of the grad buffer.
+        - The param's range within itself (i.e., its shard).
+        """
+        # Param range map.
+        param_world_index_map = model._grad_buffer_param_index_map[dtype]
+        param_range_map = {}
+        for param, param_world_indexes in param_world_index_map.items():
+            # Param range.
+            param_world_start, param_world_end = param_world_indexes
+            param_local_start = max(
+                0,
+                param_world_start - gbuf_world_range.start)
+            param_local_end = min(
+                gbuf_world_range.size,
+                param_world_end - gbuf_world_range.start)
+            # Add param, if within local gbuf range.
+            if param_local_end > param_local_start:
+                param_local_range = Range(param_local_start, param_local_end)
+                param_world_range = param_local_range.normalize(
+                    param_local_start + gbuf_world_range.start)
+                sub_param_start = max(0, gbuf_world_range.start-param_world_start)
+                sub_param_range = param_local_range.normalize(sub_param_start)
+                param_range_map[param] = {
+                    "gbuf_world" : param_world_range,
+                    "gbuf_local" : param_local_range,
+                    "param" : sub_param_range,
+                }
+        return param_range_map
+    @classmethod
+    def build_model_gbuf_range(cls, model, dtype):
+        """
+        Build mapping between params and their grad buffers.
+        This method does the initial setup for the method above. This setup
+        includes determining the shard ranges into the DDP's grad buffer for
+        each data-parallel (DP) rank. Each DP rank keeps range info for
+        all other DP ranks, for the purpose of creating args for
+        reduce-scatter and all-gather.
+        """
+        data_parallel_rank = mpu.get_data_parallel_rank()
+        data_parallel_world_size = mpu.get_data_parallel_world_size()
+        # Grad buffer range.
+        grad_buffer = model._grad_buffers[dtype]
+        gbuf_size = grad_buffer.numel
+        max_gbuf_range_size = int(math.ceil(gbuf_size / data_parallel_world_size))
+        # All world ranges. (i.e., across all data parallel ranks)
+        gbuf_world_all_ranges = []
+        for r in range(data_parallel_world_size):
+            gbuf_world_start = r * max_gbuf_range_size
+            gbuf_world_end = min(gbuf_size, gbuf_world_start+max_gbuf_range_size)
+            gbuf_world_range = Range(gbuf_world_start, gbuf_world_end)
+            gbuf_world_all_ranges.append(gbuf_world_range)
+        # Local DP's ranges.
+        gbuf_world_range = gbuf_world_all_ranges[data_parallel_rank]
+        gbuf_local_range = gbuf_world_range.normalize()
+        # Get each param's ranges.
+        param_range_map = cls.build_model_gbuf_param_range_map(model,
+                                                               dtype,
+                                                               gbuf_world_range)
+        # Group into dict.
+        data = {
+            "local" : gbuf_local_range,
+            "world" : gbuf_world_range,
+            "world_all" : gbuf_world_all_ranges,
+            "param_map" : param_range_map,
+            "max_range_size" : max_gbuf_range_size,
+        }
+        return data
+    @classmethod
+    def build_model_gbuf_range_map(cls, model):
+        """
+        Create param-to-grad-buffer mappings, for grad buffer data types
+        within a specific virtual model.
+        """
+        return {
+            dtype : cls.build_model_gbuf_range(model, dtype)
+            for dtype in model._grad_buffers
+        }
+    @classmethod
+    def build_model_param_gbuf_map(cls, model_gbuf_ranges):
+        """
+        Create a reverse of the model_gbuf_ranges, for referencing in
+        opposite direction.
+        """
+        param_gbuf_map = {}
+        for model_index, model_gbuf_range_map in enumerate(model_gbuf_ranges):
+            for dtype, gbuf_range_map in model_gbuf_range_map.items():
+                for param, param_range_map in gbuf_range_map["param_map"].items():
+                    param_gbuf_map[param] = (model_index, dtype)
+        return param_gbuf_map
+    @classmethod
+    def build_optimizer_group_ranges(cls, param_groups, model_gbuf_ranges):
+        """
+        Create optimizer groups.
+        Given the set of parameter shard ranges that are owned by the current
+        data-parallel (DP) rank, gather the set of parameters that will be
+        used (in the method below) to create the current DP's optimizer
+        groups.
+        """
+        num_groups = len(param_groups)
+        # Param group map.
+        param_group_map = {}
+        for group_index, group in enumerate(param_groups):
+            for param in group["params"]:
+                assert param.requires_grad
+                param_group_map[param] = group_index
+        # Optimizer group ranges.
+        group_ranges = [ {"params": []} for _ in param_groups ]
+        for model_gbuf_range_map in model_gbuf_ranges:
+            for dtype, gbuf_range_map in model_gbuf_range_map.items():
+                for param in gbuf_range_map["param_map"]:
+                    group_index = param_group_map[param]
+                    group_range = group_ranges[group_index]
+                    group_range["params"].append(param)
+        # Squeeze zero-size group ranges.
+        for group_index, group_range in enumerate(group_ranges):
+            group_range["orig_group"] = param_groups[group_index]
+        group_ranges = [ g for g in group_ranges if len(g["params"]) > 0 ]
+        return group_ranges
+    @classmethod
+    def build_model_and_main_param_groups(cls,
+                                          model_gbuf_ranges,
+                                          param_gbuf_map,
+                                          opt_group_ranges):
+        """
+        Create main parameter groups needed for the optimizer step.
+        These groups encompass both: 1) groups used by this class, for
+        reducing/gather, and 2) groups used by the inner optimizer for the
+        parameter update. Given that the conceptual grad buffer partitioning
+        (created in earlier method) doesn't respect parameter boundaries,
+        the optimizer operates on shards of the model parameters, rather than
+        the full parameters.
+        """
+        # Parameter groups:
+        #   model_float16_groups: original float16 parameters
+        #   model_fp32_groups: original fp32 parameters
+        #   shard_float16_groups: shards of original float16 parameters
+        #   shard_fp32_groups: shards of original fp32 parameters
+        #   shard_fp32_from_float16_groups: fp32 copy of float16 parameters
+        model_float16_groups = []
+        model_fp32_groups = []
+        shard_float16_groups = []
+        shard_fp32_groups = []
+        shard_fp32_from_float16_groups = []
+        # Allocate (or slice) each group's param shard.
+        for group_index, group_range in enumerate(opt_group_ranges):
+            # Params of this group.
+            model_float16_params_this_group = []
+            model_fp32_params_this_group = []
+            shard_float16_params_this_group = []
+            shard_fp32_params_this_group = []
+            shard_fp32_from_float16_params_this_group = []
+            model_float16_groups.append(model_float16_params_this_group)
+            model_fp32_groups.append(model_fp32_params_this_group)
+            shard_float16_groups.append(shard_float16_params_this_group)
+            shard_fp32_groups.append(shard_fp32_params_this_group)
+            shard_fp32_from_float16_groups.append(
+                shard_fp32_from_float16_params_this_group)
+            for model_param in group_range["params"]:
+                assert model_param.requires_grad
+                model_index, dtype = param_gbuf_map[model_param]
+                gbuf_range = model_gbuf_ranges[model_index][dtype]
+                param_range = gbuf_range["param_map"][model_param]["param"]
+                # fp16, bf16 params.
+                if model_param.type() in ['torch.cuda.HalfTensor',
+                                          'torch.cuda.BFloat16Tensor']:
+                    # Clone model -> main.
+                    shard_model_param = model_param.detach().view(-1) \
+                        [param_range.start:param_range.end]
+                    shard_main_param = shard_model_param.clone().float()
+                    mpu.copy_tensor_model_parallel_attributes(
+                        shard_model_param, model_param)
+                    mpu.copy_tensor_model_parallel_attributes(
+                        shard_main_param, model_param)
+                    if hasattr(model_param, 'shared'):
+                        shard_model_param.shared = model_param.shared
+                        shard_main_param.shared = model_param.shared
+                    # Add to group.
+                    model_float16_params_this_group.append(model_param)
+                    shard_float16_params_this_group.append(shard_model_param)
+                    shard_fp32_from_float16_params_this_group.append(shard_main_param)
+                # fp32 params.
+                elif model_param.type() == 'torch.cuda.FloatTensor':
+                    shard_model_param = model_param.view(-1) \
+                        [param_range.start:param_range.end]
+                    model_fp32_params_this_group.append(model_param)
+                    shard_fp32_params_this_group.append(shard_model_param)
+                    mpu.copy_tensor_model_parallel_attributes(
+                        shard_model_param, model_param)
+                    if hasattr(model_param, 'shared'):
+                        shard_model_param.shared = model_param.shared
+                else:
+                    raise TypeError('Wrapped parameters must be one of '
+                                    'torch.cuda.FloatTensor,  '
+                                    'torch.cuda.HalfTensor, or '
+                                    'torch.cuda.BFloat16Tensor. '
+                                    'Received {}'.format(param.type()))
+            # Update optimizer's params.
+            group_range["orig_group"]["params"] = [
+                *shard_fp32_params_this_group,
+                *shard_fp32_from_float16_params_this_group,
+            ]
+        return (
+            model_float16_groups,
+            model_fp32_groups,
+            shard_float16_groups,
+            shard_fp32_groups,
+            shard_fp32_from_float16_groups,
+        )
+    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
+                 params_have_main_grad, use_contiguous_buffers_in_local_ddp,
+                 fp16, bf16, grad_scaler, models):
+        """
+        See top of class definition for argument descriptions.
+        The steps in this method create the core mapping between DDP grad
+        buffers, parameters, and parameter shard ranges, that is needed for
+        converting between model param indexes and main parameter shard
+        indexes. This method also updates the optimizer parameter groups
+        with the newly created shards.
+        """
+        super().__init__(
+            optimizer, clip_grad, log_num_zeros_in_grad,
+            params_have_main_grad, use_contiguous_buffers_in_local_ddp,
+            fp16, bf16, grad_scaler, models)
+        # Verify that contiguous buffers are being used.
+        # - Note: this should already be checked in arguments.py.
+        assert use_contiguous_buffers_in_local_ddp
+        # Model grad buffer ranges.
+        self.model_gbuf_ranges = []
+        for model_index, model in enumerate(self.models):
+            self.model_gbuf_ranges.append(self.build_model_gbuf_range_map(model))
+        self.model_param_gbuf_map = \
+            self.build_model_param_gbuf_map(self.model_gbuf_ranges)
+        # Optimizer ranges.
+        self.opt_group_ranges = self.build_optimizer_group_ranges(
+            self.optimizer.param_groups,
+            self.model_gbuf_ranges)
+        # Allocate main param shards.
+        (
+            self.model_float16_groups,
+            self.model_fp32_groups,
+            self.shard_float16_groups,
+            self.shard_fp32_groups,
+            self.shard_fp32_from_float16_groups,
+        ) = self.build_model_and_main_param_groups(self.model_gbuf_ranges,
+                                                   self.model_param_gbuf_map,
+                                                   self.opt_group_ranges)
+        # Update optimizer groups.
+        # - Also, leverage state_dict() and load_state_dict() to
+        #   recast preexisting per-param state tensors.
+        self.optimizer.param_groups = \
+            [ g["orig_group"] for g in self.opt_group_ranges ]
+        self.optimizer.load_state_dict(self.optimizer.state_dict())
+    def get_model_param_range_map(self, param):
+        """
+        Given a model param, get the index sub-range of the param that this
+        data-parallel rank owns.
+        """
+        model_index, dtype = self.model_param_gbuf_map[param]
+        gbuf_range_map = self.model_gbuf_ranges[model_index][dtype]
+        param_range_map = gbuf_range_map["param_map"][param]
+        return param_range_map
+    def get_model_parallel_group(self):
+        """
+        With the distributed optimizer, the model parallel group is the
+        entire world.
+        """
+        return None
+    def state_dict(self):
+        """
+        The state dict must contain the fp32-from-float16 shards.
+        """
+        state_dict = {}
+        state_dict['optimizer'] = self.optimizer.state_dict()
+        if self.grad_scaler:
+            state_dict['grad_scaler'] = self.grad_scaler.state_dict()
+        state_dict['shard_fp32_from_float16_groups'] = \
+            self.shard_fp32_from_float16_groups
+        return state_dict
+    def load_state_dict(self, state_dict):
+        """
+        Load the state dict.
+        """
+        # Optimizer.
+        optimizer_key = 'optimizer'
+        if optimizer_key not in state_dict:
+            optimizer_key = 'optimizer_state_dict'
+            print_rank_0('***WARNING*** loading optimizer from '
+                         'an old checkpoint ...')
+        self.optimizer.load_state_dict(state_dict[optimizer_key])
+        # Grad scaler.
+        if 'grad_scaler' not in state_dict:
+            print_rank_0('***WARNING*** found an old checkpoint, will not '
+                         'load grad scaler ...')
+        else:
+            if self.grad_scaler:
+                self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
+            else:
+                print_rank_0('***WARNING*** fould the grad scaler in the '
+                             'checkpoint but it is None in the class. '
+                             'Skipping loading grad scaler ...')
+        # Copy data for the main params.
+        for current_group, saved_group in zip(
+                self.shard_fp32_from_float16_groups,
+                state_dict["shard_fp32_from_float16_groups"]):
+            for current_param, saved_param in zip(current_group, saved_group):
+                current_param.data.copy_(saved_param.data)
+    def zero_grad(self, set_to_none=True):
+        """
+        Zero grads.
+        We only need to zero the model related parameters, i.e.,
+        model_float16_groups & model_fp32_groups. We additionally zero
+        the remaining groups as a memory optimization to reduce
+        fragmentation; in the case of set_to_none==True, the space
+        used by this field can be safely deallocated at this point.
+        """
+        for groups in (
+                self.model_float16_groups,
+                self.model_fp32_groups,
+                self.shard_float16_groups, # grad empty/unused here?
+                self.shard_fp32_groups, # throws grad-access warning
+                self.shard_fp32_from_float16_groups):
+            for group in groups:
+                _zero_grad_group_helper(group, set_to_none)
+    def get_model_grad_buffer_dp_views(self):
+        """
+        Get shard views of each of the DDP's grad buffers.
+        In this nested list, the top level is grouped by the virtual model
+        index and the grad buffer's data type. The sub-level is a list of
+        shards of that grad buffer, where each shard in the list represents
+        a contiguous view of the grad buffer, that is owned by a data-parallel
+        rank. The shard boundary does not respect parameter boundaries, and
+        so the elements of some parameters are split across data parallel
+        ranks.
+        Additionally, return references to the entire grad buffers, for use
+        in _reduce_scatter_base and _all_gather_base.
+        """
+        data_parallel_world_size = mpu.get_data_parallel_world_size()
+        # Grad buffer views.
+        gbuf_view_items = []
+        for model_index, model in enumerate(self.models):
+            for dtype, gbuf in model._grad_buffers.items():
+                assert gbuf.numel_padded % data_parallel_world_size == 0
+                shard_size = int(gbuf.numel_padded / data_parallel_world_size)
+                gbuf_views = [gbuf.data[(r*shard_size):((r+1)*shard_size)]
+                              for r in range(data_parallel_world_size)]
+                gbuf_view_items.append((model_index, dtype, gbuf.data, gbuf_views))
+        return gbuf_view_items
+    def reduce_model_grads(self, args, timers):
+        """
+        Reduce-scatter model grads.
+        The DDP's grad buffer is used for the reduce-scatter, and thus no
+        tensors are dynamically allocated.
+        Note: this is a different order of reduction, versus the non-
+        distributed optimizer, which reduces: 1) layernorm grads, 2) all
+        grads, 3) embedding grads.
+        """
+        # All-reduce layer-norm grads (for sequence parallelism).
+        timers('backward-layernorm-all-reduce').start()
+        self.allreduce_layernorm_grads(args)
+        timers('backward-layernorm-all-reduce').stop()
+        # All-reduce embedding grads.
+        timers('backward-embedding-all-reduce').start()
+        self.allreduce_embedding_grads(args)
+        timers('backward-embedding-all-reduce').stop()
+        # Reduce-scatter setup.
+        timers('backward-params-all-reduce').start()
+        data_parallel_rank = mpu.get_data_parallel_rank()
+        data_parallel_world_size = mpu.get_data_parallel_world_size()
+        data_parallel_group = mpu.get_data_parallel_group()
+        # Scale grad buffers by '1 / data_parallel_world_size'.
+        for model in self.models:
+            for dtype, gbuf in model._grad_buffers.items():
+                gbuf.data /= data_parallel_world_size
+        # Reduce-scatter all grads.
+        gbuf_view_items = self.get_model_grad_buffer_dp_views()
+        for index, (model_index, dtype, gbuf, gbuf_views) \
+            in enumerate(gbuf_view_items):
+            torch.distributed._reduce_scatter_base(
+                gbuf_views[data_parallel_rank],
+                gbuf,
+                group = data_parallel_group,
+            )
+        timers('backward-params-all-reduce').stop()
+    def gather_model_params(self, args, timers):
+        """
+        All-gather updated model params.
+        The DDP's grad buffer is used for the all-gather, and thus no
+        tensors are dynamically allocated. After the all-gather, the params
+        can be copied from param.main_grad to param.
+        """
+        timers('backward-params-all-gather').start()
+        data_parallel_rank = mpu.get_data_parallel_rank()
+        data_parallel_group = mpu.get_data_parallel_group()
+        # All-gather updated main params.
+        # - All grad buffer views are guaranteed to have the same num elements
+        #   across all data parallel ranks, with grad buffer padding that is done
+        #   in distributed.py. Thus, all sub-views will have consistent start/end
+        #   indexes across data parallel ranks.
+        gbuf_view_items = self.get_model_grad_buffer_dp_views()
+        for index, (model_index, dtype, gbuf, gbuf_views) \
+            in enumerate(gbuf_view_items):
+            torch.distributed._all_gather_base(
+                gbuf,
+                gbuf_views[data_parallel_rank],
+                group = data_parallel_group,
+            )
+        # Each model param now contains its updated values in its
+        # '.main_grad' field.
+        for model in self.models:
+            for dtype, param_map in model._grad_buffer_param_index_map.items():
+                for param in param_map:
+                    param.detach().copy_(param.main_grad)
+        timers('backward-params-all-gather').stop()
+    def _collect_main_grad_data_for_unscaling(self):
+        """
+        Note: this should be equivalent to the float-16 optimizer's method,
+        but writtent differently, so the two should be combined.
+        """
+        return [
+            param.grad.data
+            for group in self.optimizer.param_groups
+            for param in group["params"]
+        ]
+    def _get_model_and_main_params_data_float16(self):
+        """
+        Get aligned list of model and main params.
+        """
+        model_data = []
+        main_data = []
+        for model_group, main_group in zip(self.shard_float16_groups,
+                                           self.shard_fp32_from_float16_groups):
+            for model_param, main_param in zip(model_group, main_group):
+                model_data.append(model_param.data)
+                main_data.append(main_param.data)
+        return model_data, main_data
+    def _copy_model_grads_to_main_grads(self):
+        """
+        Copy model grads to main grads.
+        Since this step follows a reduce-scatter through the DDP's grad
+        buffer, this method is responsible for copying the updated grads
+        from the grad buffer to the main shard's grad field.
+        """
+        # Utility method for copying group grads.
+        def copy_group_grads(model_groups, shard_main_groups):
+            for model_group, shard_main_group in zip(model_groups,
+                                                     shard_main_groups):
+                for model_param, shard_main_param in zip(model_group,
+                                                         shard_main_group):
+                    param_range_map = self.get_model_param_range_map(model_param)
+                    param_range = param_range_map["param"]
+                    assert param_range.size == shard_main_param.nelement()
+                    model_grad = model_param.main_grad
+                    shard_model_grad = model_grad.view(-1) \
+                        [param_range.start:param_range.end]
+                    shard_main_param.grad = shard_model_grad.float()
+        # Copy model groups to shard groups.
+        copy_group_grads(self.model_float16_groups,
+                         self.shard_fp32_from_float16_groups)
+        copy_group_grads(self.model_fp32_groups,
+                         self.shard_fp32_groups)
+    def _copy_main_params_to_model_params(self):
+        """
+        Copy main params to model params.
+        Since this step is followed by an all-gather through the DDP's grad
+        buffer, this method is responsible for copying the updated params
+        from the main shards into the correct position in the grad buffer.
+        """
+        # Utility method for copying group params.
+        def copy_group_params(shard_main_groups, model_groups):
+            for shard_main_group, model_group in zip(shard_main_groups,
+                                                     model_groups):
+                for shard_main_param, model_param in zip(shard_main_group,
+                                                         model_group):
+                    param_range_map = self.get_model_param_range_map(model_param)
+                    param_range = param_range_map["param"]
+                    assert param_range.size == shard_main_param.nelement()
+                    model_grad = model_param.main_grad
+                    shard_model_grad = model_grad.view(-1) \
+                        [param_range.start:param_range.end]
+                    shard_model_grad.data.copy_(shard_main_param)
+        # Copy shard groups to model groups.
+        copy_group_params(self.shard_fp32_from_float16_groups,
+                          self.model_float16_groups)
+        copy_group_params(self.shard_fp32_groups,
+                          self.model_fp32_groups)
--- a/megatron/optimizer/optimizer.py
+++ b/megatron/optimizer/optimizer.py
@@ -17,15 +17,20 @@
 from abc import ABC
 from abc import abstractmethod
-import torch
 from apex.multi_tensor_apply import multi_tensor_applier
 import amp_C
+import torch
+from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from megatron import get_timers
 from megatron import mpu
 from megatron import print_rank_0
+from megatron.model import DistributedDataParallel as LocalDDP
+from megatron.model import Float16Module
+from megatron.model.module import param_is_not_shared
+from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
+from megatron.utils import unwrap_model
 from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32
@@ -69,7 +74,8 @@ class MegatronOptimizer(ABC):
    def __init__(self, optimizer, clip_grad,
                 log_num_zeros_in_grad,
                 params_have_main_grad,
-                 use_contiguous_buffers_in_local_ddp):
+                 use_contiguous_buffers_in_local_ddp,
+                 models):
        """Input optimizer is the base optimizer for example Adam."""
        self.optimizer = optimizer
@@ -80,10 +86,15 @@ class MegatronOptimizer(ABC):
        self.params_have_main_grad = params_have_main_grad
        self.use_contiguous_buffers_in_local_ddp = use_contiguous_buffers_in_local_ddp
+        # 'models' are retained for access to the contiguous grad buffers.
+        # (see distributed optimizer)
+        self.models = models
        if self.use_contiguous_buffers_in_local_ddp:
            assert self.params_have_main_grad, \
                "use of contiguous buffer requires that params have main grad"
    def get_parameters(self):
        params = []
        for param_group in self.optimizer.param_groups:
@@ -92,14 +103,42 @@ class MegatronOptimizer(ABC):
        return params
+    def get_main_grads_for_grad_norm(self):
+        # Filter parameters based on:
+        #   - grad should not be none
+        #   - parameter should not be shared
+        #   - should not be a replica due to tensor model parallelism
+        params = self.get_parameters()
+        grads_for_norm = []
+        for param in params:
+            grad = param.grad
+            grad_not_none = grad is not None
+            is_not_shared = param_is_not_shared(param)
+            is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+            if grad_not_none and is_not_shared and is_not_tp_duplicate:
+                grads_for_norm.append(grad)
+        return grads_for_norm
+    def get_model_parallel_group(self):
+        """Default returned here, but the distributed optimizer overrides this."""
+        return mpu.get_model_parallel_group()
    def clip_grad_norm(self, clip_grad):
        params = self.get_parameters()
-        return clip_grad_norm_fp32(params, clip_grad)
+        grads_for_norm = self.get_main_grads_for_grad_norm()
+        return clip_grad_norm_fp32(
+            params, grads_for_norm, clip_grad,
+            model_parallel_group=self.get_model_parallel_group())
    def count_zeros(self):
        params = self.get_parameters()
-        return count_zeros_fp32(params)
+        return count_zeros_fp32(params,
+                                model_parallel_group=self.get_model_parallel_group())
    @abstractmethod
@@ -118,11 +157,6 @@ class MegatronOptimizer(ABC):
        return self.get_loss_scale() * loss
-    @abstractmethod
-    def step(self):
-        pass
    @abstractmethod
    def reload_model_params(self):
        """Refreshes any internal state from the current model parameters.
@@ -166,9 +200,119 @@ class MegatronOptimizer(ABC):
    param_groups = property(_get_param_groups, _set_param_groups)
+    @abstractmethod
+    def step(self, args, timers):
+        pass
-class Float16OptimizerWithFloat16Params(MegatronOptimizer):
-    """Float16 optimizer for fp16 and bf16 data types.
+    def gather_model_params(self, args, timers):
+        """
+        For the case of a non-distributed-optimizer, there is nothing to
+        do here.
+        """
+        pass
+    def allreduce_word_embedding_grads(self, args):
+        """
+        All-reduce word embedding grads.
+        Reduce grads across first and last stages to ensure that word_embeddings
+        parameters stay in sync. This should only run for models that support
+        pipelined model parallelism (BERT and GPT-2).
+        """
+        if mpu.is_rank_in_embedding_group(ignore_virtual=True) and \
+                mpu.get_pipeline_model_parallel_world_size() > 1:
+            if mpu.is_pipeline_first_stage(ignore_virtual=True):
+                unwrapped_model = self.models[0]
+            elif mpu.is_pipeline_last_stage(ignore_virtual=True):
+                unwrapped_model = self.models[-1]
+            else:  # We do not support the interleaved schedule for T5 yet.
+                unwrapped_model = self.models[0]
+            unwrapped_model = unwrap_model(
+                unwrapped_model, (torchDDP, LocalDDP, Float16Module))
+            if unwrapped_model.share_word_embeddings:
+                word_embeddings_weight = unwrapped_model.word_embeddings_weight()
+                if args.DDP_impl == 'local':
+                    grad = word_embeddings_weight.main_grad
+                else:
+                    grad = word_embeddings_weight.grad
+                torch.distributed.all_reduce(grad, group=mpu.get_embedding_group())
+    def allreduce_position_embedding_grads(self, args):
+        """
+        All-reduce position_embeddings grad across first (encoder) and
+        split (decoder) stages to ensure that position embeddings parameters
+        stay in sync. This should only run for T5 models with pipeline
+        parallelism.
+        """
+        if mpu.is_rank_in_position_embedding_group() and \
+                mpu.get_pipeline_model_parallel_world_size() > 1 and \
+                args.pipeline_model_parallel_split_rank is not None:
+            unwrapped_model = self.models[0]
+            unwrapped_model = unwrap_model(
+                unwrapped_model, (torchDDP, LocalDDP, Float16Module))
+            assert args.DDP_impl == 'local', \
+                'T5 model is only supported with local DDP mode'
+            grad = unwrapped_model.language_model.embedding.position_embeddings.weight.main_grad
+            torch.distributed.all_reduce(grad, group=mpu.get_position_embedding_group())
+    def allreduce_embedding_grads(self, args):
+        """All-reduce both word and position embeddings."""
+        self.allreduce_word_embedding_grads(args)
+        self.allreduce_position_embedding_grads(args)
+    def allreduce_layernorm_grads(self, args):
+        """All-reduce layernorm grads (for sequence parallelism)."""
+        # All-reduce layernorm parameters across model parallel nodes
+        # when sequence parallelism is used
+        if mpu.get_tensor_model_parallel_world_size() > 1 and \
+                args.sequence_parallel:
+            grads = []
+            for model_module in self.models:
+                unwrapped_model = unwrap_model( 
+                    model_module, (torchDDP, LocalDDP, Float16Module))
+                for param in unwrapped_model.parameters():
+                    if getattr(param, 'sequence_parallel', False):
+                        grad = param.main_grad if args.DDP_impl == 'local' else param.grad
+                        grads.append(grad.data)
+            coalesced = _flatten_dense_tensors(grads)
+            torch.distributed.all_reduce(
+                coalesced, group=mpu.get_tensor_model_parallel_group())
+            for buf, synced in zip(grads, _unflatten_dense_tensors(
+                    coalesced, grads)):
+                buf.copy_(synced)
+    def reduce_model_grads(self, args, timers):
+        """All-reduce all grads, and all-reduce embeddings."""
+        # All-reduce layer-norm grads (for sequence parallelism).
+        timers('backward-layernorm-all-reduce').start()
+        self.allreduce_layernorm_grads(args)
+        timers('backward-layernorm-all-reduce').stop()
+        # All-reduce if needed.
+        if args.DDP_impl == 'local':
+            timers('backward-params-all-reduce').start()
+            for model in self.models:
+                model.allreduce_gradients()
+            timers('backward-params-all-reduce').stop()
+        # All-reduce embedding grads.
+        timers('backward-embedding-all-reduce').start()
+        self.allreduce_embedding_grads(args)
+        timers('backward-embedding-all-reduce').stop()
+class MixedPrecisionOptimizer(MegatronOptimizer):
+    """Base class for both the float-16 and the distributed optimizer.
    Arguments:
        optimizer: base optimizer such as Adam or SGD
@@ -184,27 +328,36 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
            to do gradient accumulation and all-reduces in float32
            and as a result we store those gradients in the main_grad.
            Note that main grad is not necessarily in float32.
+        use_contiguous_buffers_in_local_ddp: if true, the local DDP model
+            is using a contiguous buffer to hold the model grads.
+        fp16: if true, the model is running in fp16.
        bf16: if true, the model is running in bfloat16.
        grad_scaler: used for scaling gradients. Note that this can be
            None. This case happens when `bf16 = True` and we don't
            use any loss scale. Note that for `bf16 = True`, we can have
            a constnat gradient scaler. Also for `bf16 = False`, we
            always require a grad scaler.
+        models: list of models (i.e., the virtual pipelining models). This
+            is used by the distributed optimizer for mapping parameters.
    """
    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
                 params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-                 bf16, grad_scaler):
+                 fp16, bf16, grad_scaler,
+                 models):
-        super(Float16OptimizerWithFloat16Params, self).__init__(
+        super().__init__(
            optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad, use_contiguous_buffers_in_local_ddp)
+            params_have_main_grad, use_contiguous_buffers_in_local_ddp,
+            models)
+        self.fp16 = fp16
        self.bf16 = bf16
        self.grad_scaler = grad_scaler
        # None grad scaler is only supported for bf16.
        if self.grad_scaler is None:
-            assert self.bf16, 'fp16 expects a grad scaler.'
+            assert not self.fp16, 'fp16 expects a grad scaler.'
        # Tensor used to determine if a nan/if has happend.
        # Any non-zero value indicates inf/nan.
@@ -225,6 +378,131 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
        if self.grad_scaler is None:
            self._scale_one = torch.cuda.FloatTensor([1.0])
+    def get_loss_scale(self):
+        if self.grad_scaler is None:
+            return self._scale_one
+        return self.grad_scaler.scale
+    def reload_model_params(self):
+        self._copy_model_params_to_main_params()
+    def _unscale_main_grads_and_check_for_nan(self):
+        # Collect main grads.
+        main_grads = self._collect_main_grad_data_for_unscaling()
+        # Reset found inf.
+        self.found_inf.fill_(0.0)
+        # Unscale and set found inf/nan
+        torch._amp_foreach_non_finite_check_and_unscale_(
+            main_grads, self.found_inf, self.grad_scaler.inv_scale)
+        # Update across all model parallel instances.
+        torch.distributed.all_reduce(self.found_inf,
+                                     op=torch.distributed.ReduceOp.MAX,
+                                     group=self.get_model_parallel_group())
+        # Check for nan.
+        found_inf_flag = (self.found_inf.item() > 0)
+        return found_inf_flag
+    @torch.no_grad()
+    def step(self, args, timers):
+        # Copy gradients from model params to main params.
+        timers('optimizer-copy-to-main-grad').start()
+        self._copy_model_grads_to_main_grads()
+        timers('optimizer-copy-to-main-grad').stop()
+        # Do unscale, check for inf, and update grad scaler only for
+        # the case that grad scaler is provided.
+        if self.grad_scaler:
+            # Unscale and check for inf/nan.
+            timers('optimizer-unscale-and-check-inf').start()
+            found_inf_flag = self._unscale_main_grads_and_check_for_nan()
+            timers('optimizer-unscale-and-check-inf').stop()
+            # We are done with scaling gradients
+            # so we can update the loss scale.
+            self.grad_scaler.update(found_inf_flag)
+            # If we found inf/nan, skip the update.
+            if found_inf_flag:
+                return False, None, None
+        # Clip the main gradients.
+        timers('optimizer-clip-main-grad').start()
+        grad_norm = None
+        if self.clip_grad > 0.0:
+            grad_norm = self.clip_grad_norm(self.clip_grad)
+        timers('optimizer-clip-main-grad').stop()
+        # Count the zeros in the grads.
+        timers('optimizer-count-zeros').start()
+        num_zeros_in_grad = self.count_zeros() if \
+                            self.log_num_zeros_in_grad else None
+        timers('optimizer-count-zeros').stop()
+        # Step the optimizer.
+        timers('optimizer-inner-step').start()
+        self.optimizer.step()
+        timers('optimizer-inner-step').stop()
+        # Update params from main params.
+        timers('optimizer-copy-main-to-model-params').start()
+        self._copy_main_params_to_model_params()
+        timers('optimizer-copy-main-to-model-params').stop()
+        # Successful update.
+        return True, grad_norm, num_zeros_in_grad
+class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
+    """Float16 optimizer for fp16 and bf16 data types.
+    Arguments:
+        optimizer: base optimizer such as Adam or SGD
+        clip_grad: clip gradeints with this global L2 norm. Note
+            that clipping is ignored if clip_grad == 0
+        log_num_zeros_in_grad: return number of zeros in the gradients.
+        params_have_main_grad: flag indicating if parameters have
+            a `main_grad` field. If this is set, we are assuming
+            that the model parameters are store in the `main_grad`
+            field instead of the typical `grad` field. This happens
+            for the DDP cases where there is a continuous buffer
+            holding the gradients. For example for bfloat16, we want
+            to do gradient accumulation and all-reduces in float32
+            and as a result we store those gradients in the main_grad.
+            Note that main grad is not necessarily in float32.
+        use_contiguous_buffers_in_local_ddp: if true, the local DDP model
+            is using a contiguous buffer to hold the model grads.
+        fp16: if true, the model is running in fp16.
+        bf16: if true, the model is running in bfloat16.
+        grad_scaler: used for scaling gradients. Note that this can be
+            None. This case happens when `bf16 = True` and we don't
+            use any loss scale. Note that for `bf16 = True`, we can have
+            a constnat gradient scaler. Also for `bf16 = False`, we
+            always require a grad scaler.
+        models: list of models (i.e., the virtual pipelining models). This
+            is used by the distributed optimizer for mapping parameters.
+    """
+    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
+                 params_have_main_grad, use_contiguous_buffers_in_local_ddp,
+                 fp16, bf16, grad_scaler, models):
+        super().__init__(
+            optimizer, clip_grad, log_num_zeros_in_grad,
+            params_have_main_grad, use_contiguous_buffers_in_local_ddp,
+            fp16, bf16, grad_scaler, models)
        # ======================
        # main parameter stuff
        # ======================
@@ -259,12 +537,12 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                            main_param.shared = param.shared
                        # Replace the optimizer params with the new fp32 copy.
                        param_group['params'][i] = main_param
                        fp32_from_float16_params_this_group.append(main_param)
                        # Reset existing state dict key to the new main param.
                        if param in self.optimizer.state:
                            self.optimizer.state[main_param] \
                                = self.optimizer.state.pop(param)
                    # fp32 params.
                    elif param.type() == 'torch.cuda.FloatTensor':
                        fp32_params_this_group.append(param)
@@ -282,10 +560,6 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                fp32_from_float16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)
-        # Leverage state_dict() and load_state_dict() to
-        # recast preexisting per-param state tensors
-        self.optimizer.load_state_dict(self.optimizer.state_dict())
    def zero_grad(self, set_to_none=True):
        """We only need to zero the model related parameters, i.e.,
@@ -301,10 +575,34 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
            _zero_grad_group_helper(group, set_to_none)
-    def get_loss_scale(self):
+    def _collect_main_grad_data_for_unscaling(self):
-        if self.grad_scaler is None:
-            return self._scale_one
+        main_grads = []
-        return self.grad_scaler.scale
+        # fp32 params from float16 ones.
+        for main_group in self.fp32_from_float16_groups:
+            for main_param in main_group:
+                if main_param.grad is not None:
+                    main_grads.append(main_param.grad.data)
+        # Append fp32 parameters.
+        for main_group in self.fp32_from_fp32_groups:
+            for main_param in main_group:
+                if main_param.grad is not None:
+                    main_grads.append(main_param.grad.data)
+        return main_grads
+    def _get_model_and_main_params_data_float16(self):
+        model_data = []
+        main_data = []
+        for model_group, main_group in zip(self.float16_groups,
+                                           self.fp32_from_float16_groups):
+            for model_param, main_param in zip(model_group, main_group):
+                model_data.append(model_param.data)
+                main_data.append(main_param.data)
+        return model_data, main_data
    def _copy_model_grads_to_main_grads(self):
@@ -338,43 +636,6 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                    if not self.use_contiguous_buffers_in_local_ddp:
                        model_param.main_grad = None
-    def _unscale_main_grads_and_check_for_nan(self):
-        main_grads = []
-        # fp32 params fromm float16 ones.
-        for main_group in self.fp32_from_float16_groups:
-            for main_param in main_group:
-                if main_param.grad is not None:
-                    main_grads.append(main_param.grad.data)
-        # Append fp32 parameters.
-        for main_group in self.fp32_from_fp32_groups:
-            for main_param in main_group:
-                if main_param.grad is not None:
-                    main_grads.append(main_param.grad.data)
-        # Reset found inf.
-        self.found_inf.fill_(0.0)
-        # Unscale and set found inf/nan
-        torch._amp_foreach_non_finite_check_and_unscale_(
-            main_grads, self.found_inf, self.grad_scaler.inv_scale)
-        # Update across all model parallel instances.
-        torch.distributed.all_reduce(self.found_inf,
-                                     op=torch.distributed.ReduceOp.MAX,
-                                     group=mpu.get_model_parallel_group())
-        # Check for nan.
-        found_inf_flag = (self.found_inf.item() > 0)
-        return found_inf_flag
-    def _get_model_and_main_params_data_float16(self):
-        model_data = []
-        main_data = []
-        for model_group, main_group in zip(self.float16_groups,
-                                           self.fp32_from_float16_groups):
-            for model_param, main_param in zip(model_group, main_group):
-                model_data.append(model_param.data)
-                main_data.append(main_param.data)
-        return model_data, main_data
    def _copy_main_params_to_model_params(self):
        # Only needed for the float16 params.
@@ -390,60 +651,6 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                                        overflow_buf=self._dummy_overflow_buf)
-    def reload_model_params(self):
-        self._copy_model_params_to_main_params()
-    @torch.no_grad()
-    def step(self):
-        timers = get_timers()
-        # Copy gradients from model params to main params.
-        timers('optimizer-copy-to-main-grad').start()
-        self._copy_model_grads_to_main_grads()
-        timers('optimizer-copy-to-main-grad').stop()
-        # Do unscale, check for inf, and update grad scaler only for
-        # the case that grad scaler is provided.
-        if self.grad_scaler:
-            # Unscale and check for inf/nan.
-            timers('optimizer-unscale-and-check-inf').start()
-            found_inf_flag = self._unscale_main_grads_and_check_for_nan()
-            timers('optimizer-unscale-and-check-inf').stop()
-            # We are done with scaling gradients
-            # so we can update the loss scale.
-            self.grad_scaler.update(found_inf_flag)
-            # If we found inf/nan, skip the update.
-            if found_inf_flag:
-                return False, None, None
-        # Clip the main gradients.
-        timers('optimizer-clip-main-grad').start()
-        grad_norm = None
-        if self.clip_grad > 0.0:
-            grad_norm = self.clip_grad_norm(self.clip_grad)
-        timers('optimizer-clip-main-grad').stop()
-        # count the zeros in the grads
-        num_zeros_in_grad = self.count_zeros() if \
-                            self.log_num_zeros_in_grad else None
-        # Step the optimizer.
-        self.optimizer.step()
-        # Update params from main params.
-        timers('optimizer-copy-main-to-model-params').start()
-        self._copy_main_params_to_model_params()
-        timers('optimizer-copy-main-to-model-params').stop()
-        # Successful update.
-        return True, grad_norm, num_zeros_in_grad
    def state_dict(self):
        state_dict = {}
        state_dict['optimizer'] = self.optimizer.state_dict()
@@ -485,17 +692,18 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                current_param.data.copy_(saved_param.data)
 class FP32Optimizer(MegatronOptimizer):
    def __init__(self, optimizer, clip_grad,
                 log_num_zeros_in_grad,
                 params_have_main_grad,
-                 use_contiguous_buffers_in_local_ddp):
+                 use_contiguous_buffers_in_local_ddp,
+                 models):
        super(FP32Optimizer, self).__init__(
            optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad, use_contiguous_buffers_in_local_ddp)
+            params_have_main_grad, use_contiguous_buffers_in_local_ddp,
+            models)
        self._scale = torch.cuda.FloatTensor([1.0])
@@ -512,11 +720,12 @@ class FP32Optimizer(MegatronOptimizer):
    @torch.no_grad()
-    def step(self):
+    def step(self, args, timers):
        """Clip gradients (if needed) and step the base optimizer.
        Always return successful since there is no overflow."""
        # Copy main_grads to grads.
+        timers('optimizer-copy-to-main-grad').start()
        if self.params_have_main_grad:
            for param_group in self.optimizer.param_groups:
                for param in param_group['params']:
@@ -527,18 +736,25 @@ class FP32Optimizer(MegatronOptimizer):
                    # persist and therefore should not be deallocated.)
                    if not self.use_contiguous_buffers_in_local_ddp:
                        param.main_grad = None
+        timers('optimizer-copy-to-main-grad').stop()
        # Clip gradients.
+        timers('optimizer-clip-main-grad').start()
        grad_norm = None
        if self.clip_grad > 0.0:
            grad_norm = self.clip_grad_norm(self.clip_grad)
+        timers('optimizer-clip-main-grad').stop()
        # count the zeros in the grads
+        timers('optimizer-count-zeros').start()
        num_zeros_in_grad = self.count_zeros() if \
                            self.log_num_zeros_in_grad else None
+        timers('optimizer-count-zeros').stop()
        # Update parameters.
+        timers('optimizer-inner-step').start()
        self.optimizer.step()
+        timers('optimizer-inner-step').stop()
        # No overflow for FP32 optimizer.
        return True, grad_norm, num_zeros_in_grad

--- a/megatron/p2p_communication.py
+++ b/megatron/p2p_communication.py
@@ -61,7 +61,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
        tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
    override_scatter_gather_tensors_in_pipeline = False
-    if args.scatter_gather_tensors_in_pipeline:
+    if args.scatter_gather_tensors_in_pipeline and \
+            not args.sequence_parallel:
        tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
        if tensor_chunk_shape % mpu.get_tensor_model_parallel_world_size() == 0:
            tensor_chunk_shape = tensor_chunk_shape // \
@@ -93,7 +94,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
    # Split tensor into smaller chunks if using scatter-gather optimization.
    if not override_scatter_gather_tensors_in_pipeline and \
-            args.scatter_gather_tensors_in_pipeline:
+            args.scatter_gather_tensors_in_pipeline and \
+            not args.sequence_parallel:
        if tensor_send_next is not None:
            tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(tensor_send_next)
@@ -138,7 +140,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
    # If using scatter-gather optimization, gather smaller chunks.
    if not override_scatter_gather_tensors_in_pipeline and \
-            args.scatter_gather_tensors_in_pipeline:
+            args.scatter_gather_tensors_in_pipeline and \
+            not args.sequence_parallel:
        if recv_prev:
            tensor_recv_prev = mpu.gather_split_1d_tensor(
                tensor_recv_prev).view(tensor_shape).requires_grad_()

--- a/megatron/schedules.py
+++ b/megatron/schedules.py
@@ -279,8 +279,12 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
    pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()
    args = get_args()
-    tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
+    if args.sequence_parallel:
+        seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
+    else:
+        seq_length = args.seq_length
+    tensor_shape = (seq_length, args.micro_batch_size, args.hidden_size)
    # Compute number of warmup and remaining microbatches.
    num_model_chunks = len(model)
    num_microbatches = get_num_microbatches() * num_model_chunks
@@ -514,18 +518,25 @@ def get_tensor_shapes(rank, model_type):
    # Otherwise, send one tensor (pre-transpose).
    args = get_args()
    tensor_shapes = []
+    if args.sequence_parallel:
+        seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
+    else:
+        seq_length = args.seq_length
    if model_type == ModelType.encoder_and_decoder:
+        if args.sequence_parallel:
+            decoder_seq_length = args.decoder_seq_length // mpu.get_tensor_model_parallel_world_size()
+        else:
+            decoder_seq_length = args.decoder_seq_length
        if mpu.is_pipeline_stage_before_split(rank):
-            # If next rank is after split, then need transpose for encoder_hidden_state.
+            tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
-            if mpu.is_pipeline_stage_before_split(rank+1):
-                tensor_shapes.append((args.seq_length, args.micro_batch_size, args.hidden_size))
-            else:
-                tensor_shapes.append((args.micro_batch_size, args.seq_length, args.hidden_size))
        else:
-            tensor_shapes.append((args.decoder_seq_length, args.micro_batch_size, args.hidden_size))
+            tensor_shapes.append((decoder_seq_length, args.micro_batch_size, args.hidden_size))
-            tensor_shapes.append((args.micro_batch_size, args.seq_length, args.hidden_size))
+            tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
    else:
-        tensor_shapes.append((args.seq_length, args.micro_batch_size, args.hidden_size))
+        tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
    return tensor_shapes

--- a/megatron/text_generation/__init__.py
+++ b/megatron/text_generation/__init__.py
@@ -16,4 +16,5 @@
 from .api import (
    generate,
-    generate_and_post_process)
+    generate_and_post_process,
+    beam_search_and_post_process)
--- a/megatron/text_generation/api.py
+++ b/megatron/text_generation/api.py
@@ -22,7 +22,8 @@ from megatron import mpu
 from .communication import broadcast_float_list
 from .generation import (
        generate_tokens_probs_and_return_on_first_stage,
-        score_and_return_on_first_stage)
+        score_and_return_on_first_stage,
+        beam_search_and_return_on_first_stage)
 from .tokenization import (
    tokenize_prompts,
    detokenize_generations)
@@ -138,3 +139,54 @@ def generate(model,
        use_eod_token_for_early_termination=use_eod_token_for_early_termination,
        stop_on_double_eol=stop_on_double_eol,
        stop_on_eol=stop_on_eol)
+def beam_search_and_post_process(model,
+                                 prompts=None,
+                                 tokens_to_generate=0,
+                                 beam_size=0,
+                                 add_BOS=False,
+                                 stop_token=50256,
+                                 num_return_gen=1,
+                                 length_penalty=1):
+    """Run beam search and post-process outputs, i.e., detokenize,
+    move to cpu and convert to list."""
+    # Main inference.
+    tokens, scores = beam_search(model,
+                                 prompts=prompts,
+                                 tokens_to_generate=tokens_to_generate,
+                                 beam_size=beam_size,
+                                 add_BOS=add_BOS,
+                                 stop_token=stop_token,
+                                 num_return_gen=num_return_gen,
+                                 length_penalty=length_penalty)
+    # Only post-process on first stage.
+    if mpu.is_pipeline_first_stage():
+        lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device()) 
+        tokens, prompts_plus_generations, prompts_plus_generations_segments = detokenize_generations(tokens, lengths, True)
+        scores = scores.cpu().numpy().tolist()
+        return prompts_plus_generations, prompts_plus_generations_segments, scores
+    return None
+def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1):
+    # Make sure input params are avaialble to all ranks.
+    values = [tokens_to_generate,
+              beam_size,
+              add_BOS,
+              stop_token,
+              num_return_gen,
+              length_penalty]
+    values_float_tensor = broadcast_float_list(6, float_list=values)
+    tokens_to_generate = int(values_float_tensor[0].item())
+    beam_size = int(values_float_tensor[1].item())
+    add_BOS = bool(values_float_tensor[2].item())
+    stop_token = int(values_float_tensor[3].item())
+    num_return_gen = int(values_float_tensor[4].item())
+    length_penalty = values_float_tensor[5].item()
+    context_tokens_tensor, context_length_tensor = tokenize_prompts(
+        prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)
+    return beam_search_and_return_on_first_stage(model, context_tokens_tensor, context_length_tensor, 
+            beam_size, stop_token=stop_token, num_return_gen=num_return_gen, length_penalty=length_penalty)
--- a/megatron/text_generation/beam_utils.py
+++ b/megatron/text_generation/beam_utils.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+## from huggingface beam search
+class BeamHypotheses(object):
+    def __init__(self, num_beams, length_penalty=1.0, early_stopping=False):
+        """
+        Initialize n-best list of hypotheses.
+        """
+        self.length_penalty = length_penalty
+        self.early_stopping = early_stopping
+        self.num_beams = num_beams
+        self.beams = []
+        self.worst_score = 1e9
+    def __len__(self):
+        """
+        Number of hypotheses in the list.
+        """
+        return len(self.beams)
+    def add(self, hyp, sum_logprobs, length):
+        """
+        Add a new hypothesis to the list.
+        """
+        score = sum_logprobs / length ** self.length_penalty
+        if len(self) < self.num_beams or score > self.worst_score:
+            self.beams.append((score, hyp))
+            if len(self) > self.num_beams:
+                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
+                del self.beams[sorted_scores[0][1]]
+                self.worst_score = sorted_scores[1][0]
+            else:
+                self.worst_score = min(score, self.worst_score)
+    def is_done(self, best_sum_logprobs, cur_len):
+        """
+        If there are enough hypotheses and that none of the hypotheses being generated
+        can become better than the worst one in the heap, then we are done with this sentence.
+        """
+        if len(self) < self.num_beams:
+            return False
+        elif self.early_stopping:
+            return True
+        else:
+            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
+            ret = self.worst_score >= cur_score
+            return ret
--- a/megatron/text_generation/forward_step.py
+++ b/megatron/text_generation/forward_step.py
@@ -42,7 +42,18 @@ class InferenceParams:
        self.batch_size_offset = 0
        self.key_value_memory_dict = {}
+    def swap_key_value_dict(self, batch_idx):
+        "swap between batches"
+        if len(self.key_value_memory_dict) == 0:
+            raise ValueError("should not swap when dict in empty")
+        for layer_number in self.key_value_memory_dict.keys():
+            inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number]
+            assert len(batch_idx) == inference_key_memory.shape[1] ## make sure batch size is the same
+            new_inference_key_memory = inference_key_memory[:, batch_idx]
+            new_inference_value_memory = inference_value_memory[:, batch_idx]
+            self.key_value_memory_dict[layer_number] = (
+                    new_inference_key_memory, new_inference_value_memory)
 class ForwardStep:
    """Forward step function with all the communications.