Merge branch 'sequence_parallel' into 'main'

Sequence parallelism + attention checkpoint. See merge request ADLR/megatron-lm!413

Merge branch 'sequence_parallel' into 'main'
Sequence parallelism + attention checkpoint. See merge request ADLR/megatron-lm!413
9aad9203 · Jared Casper · 667c2bcb · 3f91f09b · 9aad9203 · 9aad9203
Commit 9aad9203 authored May 20, 2022 by Jared Casper
19 changed files
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -103,14 +103,20 @@ def parse_args(extra_args_provider=None, defaults={},
    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
        'longer valid, use --tensor-model-parallel-size instead'
    del args.model_parallel_size
    if args.checkpoint_activations:
-        args.activations_checkpoint_method = 'uniform'
+        args.recompute_granularity = 'full'
+        args.recompute_method = 'uniform'
        if args.rank == 0:
            print('--checkpoint-activations is no longer valid, '
-                  'use --activation-checkpoint-method instead. '
+                  'use --recompute-granularity and --recompute-method  instead. '
-                  'Defaulting to activation-checkpoint-method=uniform.')
+                  'Defaulting to recompute-granularity=full and recompute-method=uniform.')
    del args.checkpoint_activations
+    if args.recompute_activations:
+        args.recompute_granularity = 'selective'
+    del args.recompute_activations
    # Set input defaults.
    for key in defaults:
        # For default to be valid, it should not be provided in the
@@ -278,19 +284,32 @@ def parse_args(extra_args_provider=None, defaults={},
                  'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
                  'Defaulting to no_persist_layer_norm=True')
-    # Activation checkpointing.
+    # Activation recomputing.
-    if args.distribute_checkpointed_activations:
+    if args.distribute_saved_activations:
        assert args.tensor_model_parallel_size > 1, 'can distribute ' \
-            'checkpointed activations only across tensor model ' \
+            'recomputed activations only across tensor model ' \
            'parallel groups'
-        assert args.activations_checkpoint_method is not None, \
+        assert args.recompute_granularity == 'full', \
-            'for distributed checkpoint activations to work you '\
+            'distributed recompute activations is only '\
-            'need to use a activation-checkpoint method '
+            'application to full recompute granularity'
+        assert args.recompute_method is not None, \
+            'for distributed recompute activations to work you '\
+            'need to use a recompute method '
        assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 10, \
-            'distributed checkpoint activations are supported for pytorch ' \
+            'distributed recompute activations are supported for pytorch ' \
            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \
            'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR)
+    if args.recompute_granularity == 'selective':
+        assert args.recompute_method is None, \
+            'recompute method is not yet supported for ' \
+            'selective recomputing granularity'
+    # disable async_tensor_model_parallel_allreduce when
+    # sequence parallelism is enabled
+    if args.sequence_parallel:
+        args.async_tensor_model_parallel_allreduce = False
    _print_args(args)
    return args
@@ -471,27 +490,40 @@ def _add_training_args(parser):
                       ' (1024 - 16) / 8 = 126 intervals will increase'
                       'the batch size linearly to 1024. In each interval'
                       'we will use approximately 300000 / 126 = 2380 samples.')
-    group.add_argument('--checkpoint-activations', action='store_true',
+    group.add_argument('--recompute-activations', action='store_true',
-                       help='Checkpoint activation to allow for training '
+                       help='recompute activation to allow for training '
                       'with larger models, sequences, and batch sizes.')
-    group.add_argument('--distribute-checkpointed-activations',
+    group.add_argument('--recompute-granularity', type=str, default=None,
+                       choices=['full', 'selective'],
+                       help='Checkpoint activations to allow for training '
+                       'with larger models, sequences, and batch sizes. '
+                       'It is supported at two granularities 1) full: '
+                       'whole transformer layer is recomputed, '
+                       '2) selective: core attention part of the transformer '
+                       'layer is recomputed.')
+    group.add_argument('--distribute-saved-activations',
                       action='store_true',
-                       help='If set, distribute checkpointed activations '
+                       help='If set, distribute recomputed activations '
                       'across model parallel group.')
-    group.add_argument('--activations-checkpoint-method', type=str, default=None,
+    group.add_argument('--recompute-method', type=str, default=None,
                       choices=['uniform', 'block'],
                       help='1) uniform: uniformly divide the total number of '
-                       'Transformer layers and checkpoint the input activation of '
+                       'Transformer layers and recompute the input activation of '
-                       'each divided chunk, '
+                       'each divided chunk at specified granularity, '
-                       '2) checkpoint the input activations of only a set number of '
+                       '2) recompute the input activations of only a set number of '
                       'individual Transformer layers per pipeline stage and do the '
-                       'rest without any checkpointing'
+                       'rest without any recomputing at specified granularity'
-                       'default) do not apply activations checkpoint to any layers')
+                       'default) do not apply activations recompute to any layers')
-    group.add_argument('--activations-checkpoint-num-layers', type=int, default=1,
+    group.add_argument('--recompute-num-layers', type=int, default=1,
                       help='1) uniform: the number of Transformer layers in each '
-                       'uniformly divided checkpoint unit, '
+                       'uniformly divided recompute unit, '
                       '2) block: the number of individual Transformer layers '
-                       'to checkpoint within each pipeline stage.')
+                       'to recompute within each pipeline stage.')
+    # deprecated
+    group.add_argument('--checkpoint-activations', action='store_true',
+                       help='Checkpoint activation to allow for training '
+                       'with larger models, sequences, and batch sizes.')
    group.add_argument('--train-iters', type=int, default=None,
                       help='Total number of iterations to train over all '
                       'training runs. Note that either train-iters or '
@@ -540,6 +572,8 @@ def _add_training_args(parser):
                       'This kernel supports only a set of hidden sizes. Please '
                       'check persist_ln_hidden_sizes if your hidden '
                       'size is supported.')
+    group.add_argument('--sequence-parallel', action='store_true',
+                       help='Enable sequence parallel optimization.')
    group.add_argument('--no-gradient-accumulation-fusion',
                       action='store_false',
                       help='Disable fusing gradient accumulation to weight '

--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -278,9 +278,13 @@ def _warmup_jit_function():
    del bias, input, output
    # Warmup fused bias+dropout+add
-    input = torch.rand((args.seq_length, args.micro_batch_size, args.hidden_size),
+    if args.sequence_parallel:
+        seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
+    else:
+        seq_length = args.seq_length
+    input = torch.rand((seq_length, args.micro_batch_size, args.hidden_size),
                       dtype=dtype, device='cuda')
-    residual = torch.rand((args.seq_length, args.micro_batch_size, args.hidden_size),
+    residual = torch.rand((seq_length, args.micro_batch_size, args.hidden_size),
                          dtype=dtype, device='cuda')
    bias = torch.rand((args.hidden_size), dtype=dtype, device='cuda').expand_as(residual)
    dropout_rate = 0.1

--- a/megatron/model/bert_model.py
+++ b/megatron/model/bert_model.py
@@ -78,7 +78,12 @@ class BertLMHead(MegatronModule):
        self.parallel_output = parallel_output
        self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
-        self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
+        setattr(self.dense.weight, 'sequence_parallel', args.sequence_parallel)
+        setattr(self.dense.bias, 'sequence_parallel', args.sequence_parallel)
+        self.layernorm = LayerNorm(hidden_size,
+                                   eps=layernorm_epsilon,
+                                   sequence_parallel=args.sequence_parallel)
        self.gelu = torch.nn.functional.gelu
        if args.openai_gelu:
            self.gelu = openai_gelu
@@ -110,14 +115,20 @@ def post_language_model_processing(lm_output, pooled_output,
        binary_logits = binary_head(pooled_output)
    if lm_labels is None:
-        return lm_logits, binary_logits
+        # [s b h] => [b s h]
+        return lm_logits.transpose(0,1).contiguous(), binary_logits
    else:
+        # [b s] => [s b]
+        lm_labels = lm_labels.transpose(0,1).contiguous()
+        # lm_logits : [s, b, h] and lm_labels: [s, b]
        if fp16_lm_cross_entropy:
            assert lm_logits.dtype == torch.half
            lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
        else:
            lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
                                                       lm_labels)
+        # [s, b] => [b s]
+        lm_loss = lm_loss.transpose(0,1).contiguous()
        return lm_loss, binary_logits

--- a/megatron/model/biencoder_model.py
+++ b/megatron/model/biencoder_model.py
@@ -291,7 +291,7 @@ class PretrainedBertModel(MegatronModule):
        pool_mask = (input_ids == self.pad_id).unsqueeze(2)
        # Taking the representation of the [CLS] token of BERT
-        pooled_output = lm_output[:, 0, :]
+        pooled_output = lm_output[0, :, :]
        # Converting to float16 dtype
        pooled_output = pooled_output.to(lm_output.dtype)

--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -69,7 +69,9 @@ class FusedLayerNormAffineFunction(torch.autograd.Function):
 class MixedFusedLayerNorm(torch.nn.Module):
-  def __init__(self, normalized_shape, eps=1e-5, no_persist_layer_norm=True):
+  def __init__(self, normalized_shape, eps=1e-5,
+               no_persist_layer_norm=True,
+               sequence_parallel=False):
        super(MixedFusedLayerNorm, self).__init__()
        global fused_mix_prec_layer_norm_cuda
@@ -94,6 +96,11 @@ class MixedFusedLayerNorm(torch.nn.Module):
        self.bias = Parameter(torch.Tensor(*normalized_shape))
        self.reset_parameters()
        self.no_persist_layer_norm = no_persist_layer_norm
+        self.sequence_parallel = sequence_parallel
+        # set sequence parallelism flag on weight and bias parameters
+        setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
+        setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
  def reset_parameters(self):

--- a/megatron/model/gpt_model.py
+++ b/megatron/model/gpt_model.py
@@ -32,20 +32,26 @@ def post_language_model_processing(lm_output, labels, logit_weights,
                                   parallel_output,
                                   fp16_lm_cross_entropy):
-    # Output.
+    # Output. Format [s b h]
    output = parallel_lm_logits(
        lm_output,
        logit_weights,
        parallel_output)
    if labels is None:
-        return output
+        # [s b h] => [b s h]
+        return output.transpose(0,1).contiguous()
    else:
+        # [b s] => [s b]
+        labels = labels.transpose(0,1).contiguous()
        if fp16_lm_cross_entropy:
            assert output.dtype == torch.half
            loss = mpu.vocab_parallel_cross_entropy(output, labels)
        else:
            loss = mpu.vocab_parallel_cross_entropy(output.float(), labels)
+        # [s b] => [b, s]
+        loss = loss.transpose(0,1).contiguous()
        return loss

--- a/megatron/model/language_model.py
+++ b/megatron/model/language_model.py
@@ -26,23 +26,29 @@ from megatron.model.transformer import ParallelTransformer
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal, scaled_init_method_normal
 def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
                       bias=None):
    """LM logits using word embedding weights."""
    args = get_args()
    # Parallel logits.
-    if args.async_tensor_model_parallel_allreduce:
+    if args.async_tensor_model_parallel_allreduce or\
+            args.sequence_parallel:
        input_parallel = input_
-        async_grad_allreduce = mpu.get_tensor_model_parallel_world_size() > 1
+        model_parallel = mpu.get_tensor_model_parallel_world_size() > 1
+        async_grad_allreduce = args.async_tensor_model_parallel_allreduce and \
+            model_parallel and not args.sequence_parallel
    else:
        input_parallel = mpu.copy_to_tensor_model_parallel_region(input_)
        async_grad_allreduce = False
    # Matrix multiply.
-    logits_parallel = mpu.LinearWithGradAccumulationAndAsyncAllreduce.apply(
+    logits_parallel = mpu.LinearWithGradAccumulationAndAsyncCommunication.apply(
-            input_parallel, word_embeddings_weight, bias,
+        input_parallel, word_embeddings_weight, bias,
-            args.gradient_accumulation_fusion,
+        args.gradient_accumulation_fusion,
-            async_grad_allreduce)
+        async_grad_allreduce, args.sequence_parallel)
    # Gather if needed.
    if parallel_output:
        return logits_parallel
@@ -98,12 +104,21 @@ class Pooler(MegatronModule):
    def __init__(self, hidden_size, init_method):
        super(Pooler, self).__init__()
+        args = get_args()
        self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
+        self.sequence_parallel = args.sequence_parallel
    def forward(self, hidden_states, sequence_index=0):
-        # hidden_states: [b, s, h]
+        # hidden_states: [s, b, h]
        # sequence_index: index of the token to pool.
-        pooled = hidden_states[:, sequence_index, :]
+        # gather data along sequence dimensions
+        # same pooler is run on all tensor parallel nodes
+        if self.sequence_parallel:
+            hidden_states = mpu.gather_from_sequence_parallel_region(hidden_states)
+        pooled = hidden_states[sequence_index, :, :]
        pooled = self.dense(pooled)
        pooled = torch.tanh(pooled)
        return pooled
@@ -164,6 +179,8 @@ class Embedding(MegatronModule):
        else:
            self.tokentype_embeddings = None
+        self.fp32_residual_connection = args.fp32_residual_connection 
+        self.sequence_parallel = args.sequence_parallel
        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
@@ -205,8 +222,20 @@ class Embedding(MegatronModule):
        else:
            assert self.tokentype_embeddings is None
+        # Data format change to avoid explicit transposes: [b s h] --> [s b h].
+        embeddings = embeddings.transpose(0, 1).contiguous()
+        # If the input flag for fp32 residual connection is set, convert to float.
+        if self.fp32_residual_connection:
+            embeddings = embeddings.float()
        # Dropout.
-        embeddings = self.embedding_dropout(embeddings)
+        if self.sequence_parallel:
+            embeddings = mpu.scatter_to_sequence_parallel_region(embeddings)
+            with mpu.get_cuda_rng_tracker().fork():
+                embeddings = self.embedding_dropout(embeddings)
+        else:
+            embeddings = self.embedding_dropout(embeddings)
        return embeddings

--- a/megatron/model/t5_model.py
+++ b/megatron/model/t5_model.py
@@ -152,19 +152,24 @@ class T5Model(MegatronModule):
        if self.post_process and self.add_decoder:
            decoder_output, encoder_output = lm_output
-            # Output.
+            # Output. [s, b, h]
            lm_logits = self.lm_head(decoder_output,
                                     self.word_embeddings_weight())
            if lm_labels is None:
-                return lm_logits
+                # [s b h] => [b s h]
+                return lm_logits.transpose(0,1).contiguous()
            else:
+                # [b s] => [s b]
+                lm_labels = lm_labels.transpose(0,1).contiguous()
                if self.fp16_lm_cross_entropy:
                    assert lm_logits.dtype == torch.half
                    lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
                else:
                    lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
                                                               lm_labels)
+                # [s b] => [b s]
+                lm_loss = lm_loss.transpose(0,1).contiguous()
            return lm_loss
        elif self.add_decoder and not self.add_encoder:
            decoder_output, encoder_output = lm_output

--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -19,7 +19,7 @@ from contextlib import nullcontext
 import torch
 import torch.nn.functional as F
-from megatron import get_args
+from megatron import get_timers, get_args
 from megatron import mpu
 from .module import MegatronModule
 from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType
@@ -28,6 +28,7 @@ from megatron.model.fused_softmax import FusedScaleMaskSoftmax
 from megatron.model.fused_bias_gelu import bias_gelu_impl
 from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu
 """ We use the following notation throughout this file:
     h: hidden size
     n: number of attention heads
@@ -43,7 +44,6 @@ from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu
        hyperparameters: transformer hyperparameters
 """
 class DropPath(MegatronModule):
    """Drop paths (Stochastic Depth) per sample 
    (when applied in main path of residual blocks).
@@ -130,21 +130,21 @@ class SwitchMLP(MegatronModule):
            self.experts.append(ParallelMLP(init_method, output_layer_init_method))
    def forward(self, hidden_states):
-        # hidden_states: [b, s, h]
+        # hidden_states: [s, b, h]
-        b = hidden_states.size(0)
+        s = hidden_states.size(0)
-        s = hidden_states.size(1)
+        b = hidden_states.size(1)
        h = hidden_states.size(2)
        route = self.router(hidden_states)
        route = torch.nn.functional.softmax(route, dim=2)
        max_prob, max_ind = torch.max(route, dim=2)
-        max_prob = torch.unsqueeze(max_prob, 2) # [b s 1]
+        max_prob = torch.unsqueeze(max_prob, 2) # [s b 1]
        # TODO (rprenger) TODO this could be made easier to read
-        # Converting [b, s, h] to [b*s, h].
+        # Converting [s, b, h] to [s*b, h].
        # Each vector could be routed differently
-        hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [b*s h]
+        hidden_states = hidden_states.view(-1, hidden_states.size(2)) # [s*b h]
-        max_prob = max_prob.view(-1, max_prob.size(2)) # [b*s 1]
+        max_prob = max_prob.view(-1, max_prob.size(2)) # [s*b 1]
-        max_ind = max_ind.view(-1) # [b*s]
+        max_ind = max_ind.view(-1) # [s*b]
        output_total = torch.empty_like(hidden_states)
        output_bias_total = torch.empty_like(hidden_states)
@@ -160,15 +160,156 @@ class SwitchMLP(MegatronModule):
        output_total = output_total*max_prob
        output_bias_total = output_bias_total*max_prob
-        output_total = output_total.view(b, s, h)
+        output_total = output_total.view(s, b, h)
-        output_bias_total = output_bias_total.view(b, s, h)
+        output_bias_total = output_bias_total.view(s, b, h)
        return output_total, output_bias_total
+class CoreAttention(MegatronModule):
+    def __init__(self, layer_number,
+                 attn_mask_type=AttnMaskType.padding):
+        super(CoreAttention, self).__init__()
+        args = get_args()
+        self.fp16 = args.fp16
+        self.bf16 = args.bf16
+        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
+        if self.apply_query_key_layer_scaling:
+            self.attention_softmax_in_fp32 = True
+        self.layer_number = max(1, layer_number)
+        self.attn_mask_type = attn_mask_type
+        self.sequence_parallel = args.sequence_parallel
+        projection_size = args.kv_channels * args.num_attention_heads
+        # Per attention head and per partition values.
+        world_size = mpu.get_tensor_model_parallel_world_size()
+        self.hidden_size_per_partition = mpu.divide(projection_size,
+                                                    world_size)
+        self.hidden_size_per_attention_head = mpu.divide(
+            projection_size, args.num_attention_heads)
+        self.num_attention_heads_per_partition = mpu.divide(
+            args.num_attention_heads, world_size)
+        coeff = None
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+        self.scale_mask_softmax = FusedScaleMaskSoftmax(
+            self.fp16, self.bf16,
+            self.attn_mask_type,
+            args.masked_softmax_fusion,
+            attention_mask_func,
+            self.attention_softmax_in_fp32,
+            coeff)
+        # Dropout. Note that for a single iteration, this layer will generate
+        # different outputs on different number of parallel partitions but
+        # on average it should not be partition dependent.
+        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)
+    def forward(self, query_layer, key_layer,
+                value_layer, attention_mask):
+        # ===================================
+        # Raw attention scores. [b, np, s, s]
+        # ===================================
+        # [b, np, sq, sk]
+        output_size = (query_layer.size(1),
+                       query_layer.size(2),
+                       query_layer.size(0),
+                       key_layer.size(0))
+        # [sq, b, np, hn] -> [sq, b * np, hn]
+        query_layer = query_layer.view(output_size[2],
+                                       output_size[0] * output_size[1], -1)
+        # [sk, b, np, hn] -> [sk, b * np, hn]
+        key_layer = key_layer.view(output_size[3],
+                                   output_size[0] * output_size[1], -1)
+        # preallocating input tensor: [b * np, sq, sk]
+        matmul_input_buffer = torch.empty(
+            output_size[0]*output_size[1],
+            output_size[2],
+            output_size[3],
+            dtype=query_layer.dtype,
+            device=torch.cuda.current_device())
+        # Raw attention scores. [b * np, sq, sk]
+        matmul_result = torch.baddbmm(
+            matmul_input_buffer,
+            query_layer.transpose(0, 1),   # [b * np, sq, hn]
+            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+            beta=0.0, alpha=(1.0/self.norm_factor))
+        # change view to [b, np, sq, sk]
+        attention_scores = matmul_result.view(*output_size)
+        # ===========================
+        # Attention probs and dropout
+        # ===========================
+        # attention scores and attention mask [b, np, sq, sk]
+        attention_probs = self.scale_mask_softmax(attention_scores,
+                                                  attention_mask)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        if not self.sequence_parallel:
+            with mpu.get_cuda_rng_tracker().fork():
+                attention_probs = self.attention_dropout(attention_probs)
+        else:
+            attention_probs = self.attention_dropout(attention_probs)
+        # =========================
+        # Context layer. [sq, b, hp]
+        # =========================
+        # value_layer -> context layer.
+        # [sk, b, np, hn] --> [b, np, sq, hn]
+        # context layer shape: [b, np, sq, hn]
+        output_size = (value_layer.size(1),
+                       value_layer.size(2),
+                       query_layer.size(0),
+                       value_layer.size(3))
+        # change view [sk, b * np, hn]
+        value_layer = value_layer.view(value_layer.size(0),
+                                       output_size[0] * output_size[1], -1)
+        # change view [b * np, sq, sk]
+        attention_probs = attention_probs.view(output_size[0] * output_size[1],
+                                               output_size[2], -1)
+        # matmul: [b * np, sq, hn]
+        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+        # change view [b, np, sq, hn]
+        context_layer = context_layer.view(*output_size)
+        # [b, np, sq, hn] --> [sq, b, np, hn]
+        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+        # [sq, b, np, hn] --> [sq, b, hp]
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+        return context_layer
 class ParallelAttention(MegatronModule):
    """Parallel self-attention layer abstract class.
-    Self-attention layer takes input with size [b, s, h]
+    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    """
@@ -178,13 +319,6 @@ class ParallelAttention(MegatronModule):
                 attn_mask_type=AttnMaskType.padding):
        super(ParallelAttention, self).__init__()
        args = get_args()
-        self.fp16 = args.fp16
-        self.bf16 = args.bf16
-        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
-        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
-        if self.apply_query_key_layer_scaling:
-            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)
        self.attention_type = attention_type
        self.attn_mask_type = attn_mask_type
@@ -194,8 +328,6 @@ class ParallelAttention(MegatronModule):
        # Per attention head and per partition values.
        world_size = mpu.get_tensor_model_parallel_world_size()
-        self.hidden_size_per_partition = mpu.divide(projection_size,
-                                                    world_size)
        self.hidden_size_per_attention_head = mpu.divide(
            projection_size, args.num_attention_heads)
        self.num_attention_heads_per_partition = mpu.divide(
@@ -222,24 +354,9 @@ class ParallelAttention(MegatronModule):
                gather_output=False,
                init_method=init_method)
-        coeff = None
+        self.core_attention = CoreAttention(self.layer_number,
-        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+                                            self.attn_mask_type)
-        if self.apply_query_key_layer_scaling:
+        self.checkpoint_core_attention = args.recompute_granularity == 'selective'
-            coeff = self.layer_number
-            self.norm_factor *= coeff
-        self.scale_mask_softmax = FusedScaleMaskSoftmax(
-            self.fp16, self.bf16,
-            self.attn_mask_type,
-            args.masked_softmax_fusion,
-            attention_mask_func,
-            self.attention_softmax_in_fp32,
-            coeff)
-        # Dropout. Note that for a single iteration, this layer will generate
-        # different outputs on different number of parallel partitions but
-        # on average it should not be partition dependent.
-        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)
        # Output.
        self.dense = mpu.RowParallelLinear(
@@ -249,6 +366,23 @@ class ParallelAttention(MegatronModule):
            init_method=output_layer_init_method,
            skip_bias_add=True)
+    def _checkpointed_attention_forward(self, query_layer, key_layer,
+                                        value_layer, attention_mask):
+        """Forward method with activation checkpointing."""
+        def custom_forward(*inputs):
+            query_layer = inputs[0]
+            key_layer = inputs[1]
+            value_layer = inputs[2]
+            attention_mask = inputs[3]
+            output_ = self.core_attention(query_layer, key_layer,
+                                          value_layer, attention_mask)
+            return output_
+        hidden_states = mpu.checkpoint(
+            custom_forward,
+            False, query_layer, key_layer, value_layer, attention_mask)
+        return hidden_states
    def _allocate_memory(self, inference_max_sequence_len, batch_size):
        return torch.empty(
@@ -258,13 +392,11 @@ class ParallelAttention(MegatronModule):
            self.hidden_size_per_attention_head,
            dtype=self.params_dtype,
            device=torch.cuda.current_device())
    def forward(self, hidden_states, attention_mask,
                encoder_output=None, inference_params=None):
        # hidden_states: [sq, b, h]
        # =================================================
        # Pre-allocate memory for key-values for inference.
        # =================================================
@@ -282,7 +414,6 @@ class ParallelAttention(MegatronModule):
                inference_key_memory, inference_value_memory = \
                    inference_params.key_value_memory_dict[self.layer_number]
        # =====================
        # Query, Key, and Value
        # =====================
@@ -323,7 +454,6 @@ class ParallelAttention(MegatronModule):
                 self.hidden_size_per_attention_head)
            query_layer = query_layer.view(*new_tensor_shape)
        # ==================================
        # Adjust key and value for inference
        # ==================================
@@ -345,90 +475,16 @@ class ParallelAttention(MegatronModule):
            value_layer = inference_value_memory[
                :sequence_end, batch_start:batch_end, ...]
+        # ==================================
+        # core attention computation
+        # ==================================
-        # ===================================
+        if self.checkpoint_core_attention:
-        # Raw attention scores. [b, np, s, s]
+            context_layer = self._checkpointed_attention_forward(
-        # ===================================
+                query_layer, key_layer, value_layer, attention_mask)
+        else:
-        # [b, np, sq, sk]
+            context_layer = self.core_attention(
-        output_size = (query_layer.size(1),
+                query_layer, key_layer, value_layer, attention_mask)
-                       query_layer.size(2),
-                       query_layer.size(0),
-                       key_layer.size(0))
-        # [sq, b, np, hn] -> [sq, b * np, hn]
-        query_layer = query_layer.view(output_size[2],
-                                       output_size[0] * output_size[1], -1)
-        # [sk, b, np, hn] -> [sk, b * np, hn]
-        key_layer = key_layer.view(output_size[3],
-                                   output_size[0] * output_size[1], -1)
-        # preallocting result tensor: [b * np, sq, sk]
-        matmul_result = torch.empty(
-            output_size[0]*output_size[1],
-            output_size[2],
-            output_size[3],
-            dtype=query_layer.dtype,
-            device=torch.cuda.current_device())
-        # Raw attention scores. [b * np, sq, sk]
-        matmul_result = torch.baddbmm(
-            matmul_result,
-            query_layer.transpose(0, 1),   # [b * np, sq, hn]
-            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-            beta=0.0, alpha=(1.0/self.norm_factor))
-        # change view to [b, np, sq, sk]
-        attention_scores = matmul_result.view(*output_size)
-        # ===========================
-        # Attention probs and dropout
-        # ===========================
-        # attention scores and attention mask [b, np, sq, sk]
-        attention_probs = self.scale_mask_softmax(attention_scores,
-                                                  attention_mask)
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        with mpu.get_cuda_rng_tracker().fork():
-            attention_probs = self.attention_dropout(attention_probs)
-        # =========================
-        # Context layer. [sq, b, hp]
-        # =========================
-        # value_layer -> context layer.
-        # [sk, b, np, hn] --> [b, np, sq, hn]
-        # context layer shape: [b, np, sq, hn]
-        output_size = (value_layer.size(1),
-                       value_layer.size(2),
-                       query_layer.size(0),
-                       value_layer.size(3))
-        # change view [sk, b * np, hn]
-        value_layer = value_layer.view(value_layer.size(0),
-                                       output_size[0] * output_size[1], -1)
-        # change view [b * np, sq, sk]
-        attention_probs = attention_probs.view(output_size[0] * output_size[1],
-                                               output_size[2], -1)
-        # matmul: [b * np, sq, hn]
-        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
-        # change view [b, np, sq, hn]
-        context_layer = context_layer.view(*output_size)
-        # [b, np, sq, hn] --> [sq, b, np, hn]
-        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
-        # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + \
-            (self.hidden_size_per_partition,)
-        context_layer = context_layer.view(*new_context_layer_shape)
        # =================
        # Output. [sq, b, h]
@@ -471,7 +527,7 @@ def bias_dropout_add_fused_inference(x: torch.Tensor,
 class ParallelTransformerLayer(MegatronModule):
    """A single transformer layer.
-    Transformer layer takes input with size [b, s, h] and returns an
+    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    """
@@ -495,7 +551,8 @@ class ParallelTransformerLayer(MegatronModule):
        self.input_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon,
-            no_persist_layer_norm=args.no_persist_layer_norm)
+            no_persist_layer_norm=args.no_persist_layer_norm,
+            sequence_parallel=args.sequence_parallel)
        # Self attention.
        self.self_attention = ParallelAttention(
@@ -512,7 +569,8 @@ class ParallelTransformerLayer(MegatronModule):
        self.post_attention_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon,
-            no_persist_layer_norm=args.no_persist_layer_norm)
+            no_persist_layer_norm=args.no_persist_layer_norm,
+            sequence_parallel=args.sequence_parallel)
        if self.layer_type == LayerType.decoder:
            self.inter_attention = ParallelAttention(
@@ -524,7 +582,8 @@ class ParallelTransformerLayer(MegatronModule):
            self.post_inter_attention_layernorm = LayerNorm(
                args.hidden_size,
                eps=args.layernorm_epsilon,
-                no_persist_layer_norm=args.no_persist_layer_norm)
+                no_persist_layer_norm=args.no_persist_layer_norm,
+                sequence_parallel=args.sequence_parallel)
        # MLP
        if args.num_experts is not None:
@@ -542,7 +601,7 @@ class ParallelTransformerLayer(MegatronModule):
    def forward(self, hidden_states, attention_mask,
                encoder_output=None, enc_dec_attn_mask=None,
                inference_params=None):
-        # hidden_states: [b, s, h]
+        # hidden_states: [s, b, h]
        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
@@ -671,6 +730,8 @@ class ParallelTransformer(MegatronModule):
        super(ParallelTransformer, self).__init__()
        args = get_args()
+        self.layer_type = layer_type
+        self.model_type = args.model_type
        self.bf16 = args.bf16
        self.fp32_residual_connection = args.fp32_residual_connection
        self.post_layer_norm = post_layer_norm
@@ -680,9 +741,13 @@ class ParallelTransformer(MegatronModule):
        self.drop_path_rate = drop_path_rate
        # Store activation checkpoiting flag.
-        self.activations_checkpoint_method = args.activations_checkpoint_method
+        self.recompute_granularity = args.recompute_granularity
-        self.activations_checkpoint_num_layers = args.activations_checkpoint_num_layers
+        self.recompute_method = args.recompute_method
-        self.distribute_checkpointed_activations = args.distribute_checkpointed_activations
+        self.recompute_num_layers = args.recompute_num_layers
+        self.distribute_saved_activations = \
+            args.distribute_saved_activations and not args.sequence_parallel
+        self.sequence_parallel = args.sequence_parallel
        # Number of layers.
        self.num_layers = mpu.get_num_layers(
@@ -751,7 +816,8 @@ class ParallelTransformer(MegatronModule):
            self.final_layernorm = LayerNorm(
                args.hidden_size,
                eps=args.layernorm_epsilon,
-                no_persist_layer_norm=args.no_persist_layer_norm)
+                no_persist_layer_norm=args.no_persist_layer_norm,
+                sequence_parallel=args.sequence_parallel)
    def _get_layer(self, layer_number):
        return self.layers[layer_number]
@@ -771,32 +837,33 @@ class ParallelTransformer(MegatronModule):
                return x_
            return custom_forward
-        if self.activations_checkpoint_method == 'uniform':
+        if self.recompute_method == 'uniform':
            # Uniformly divide the total number of Transformer layers and checkpoint
            # the input activation of each divided chunk.
            # A method to further reduce memory usage reducing checkpoints.
            l = 0
            while l < self.num_layers:
                hidden_states = mpu.checkpoint(
-                    custom(l, l + self.activations_checkpoint_num_layers),
+                    custom(l, l + self.recompute_num_layers),
-                    self.distribute_checkpointed_activations,
+                    self.distribute_saved_activations,
                    hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
-                l += self.activations_checkpoint_num_layers
+                l += self.recompute_num_layers
-        elif self.activations_checkpoint_method == 'block':
+        elif self.recompute_method == 'block':
            # Checkpoint the input activation of only a set number of individual
            # Transformer layers and skip the rest.
            # A method fully use the device memory removing redundant re-computation.
            for l in range(self.num_layers):
-                if l < self.activations_checkpoint_num_layers:
+                if l < self.recompute_num_layers:
                    hidden_states = mpu.checkpoint(
                        custom(l, l + 1),
-                        self.distribute_checkpointed_activations,
+                        self.distribute_saved_activations,
                        hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
                else:
                    hidden_states = custom(l, l + 1)(
                        hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
        else:
-            raise ValueError("Invalid activation checkpoint method.")
+            raise ValueError("Invalid activation recompute method.")
        return hidden_states
@@ -813,21 +880,14 @@ class ParallelTransformer(MegatronModule):
    def forward(self, hidden_states, attention_mask,
                encoder_output=None, enc_dec_attn_mask=None,
                inference_params=None):
+        # hidden_states: [s, b, h]
        # Checks.
        if inference_params:
-            assert self.activations_checkpoint_method is None, \
+            assert self.recompute_granularity is None, \
                'inference does not work with activation checkpointing'
-        if self.pre_process:
+        if not self.pre_process:
-            # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
-            # If the input flag for fp32 residual connection is set, convert for float.
-            if self.fp32_residual_connection:
-                hidden_states = hidden_states.transpose(0, 1).contiguous().float()
-            # Otherwise, leave it as is.
-            else:
-                hidden_states = hidden_states.transpose(0, 1).contiguous()
-        else:
            # See set_input_tensor()
            hidden_states = self.input_tensor
@@ -848,37 +908,34 @@ class ParallelTransformer(MegatronModule):
        #   is called here to be future-proof and corner-case-proof.
        hidden_states = mpu.make_viewless_tensor(
            hidden_states,
-            requires_grad = True,
+            requires_grad=True,
-            keep_graph = True,
+            keep_graph=True,
        )
-        # Transpose encoder output.
+        if self.sequence_parallel:
-        if encoder_output is not None:
+            rng_context = mpu.get_cuda_rng_tracker().fork()
-            encoder_output = encoder_output.transpose(0, 1).contiguous()
-        # Forward pass.
-        if self.activations_checkpoint_method is not None:
-            hidden_states = self._checkpointed_forward(hidden_states,
-                                                       attention_mask,
-                                                       encoder_output,
-                                                       enc_dec_attn_mask)
        else:
-            for index in range(self.num_layers):
+            rng_context = nullcontext()
-                layer = self._get_layer(index)
-                hidden_states = layer(
+        with rng_context:
-                    hidden_states,
+            # Forward pass.
-                    attention_mask,
+            if self.recompute_granularity == 'full':
-                    encoder_output=encoder_output,
+                hidden_states = self._checkpointed_forward(hidden_states,
-                    enc_dec_attn_mask=enc_dec_attn_mask,
+                                                           attention_mask,
-                    inference_params=inference_params)
+                                                           encoder_output,
+                                                           enc_dec_attn_mask)
+            else:
+                for index in range(self.num_layers):
+                    layer = self._get_layer(index)
+                    hidden_states = layer(
+                        hidden_states,
+                        attention_mask,
+                        encoder_output=encoder_output,
+                        enc_dec_attn_mask=enc_dec_attn_mask,
+                        inference_params=inference_params)
        # Final layer norm.
-        if self.post_process:
+        if self.post_process and self.post_layer_norm:
-            # Reverting data format change [s b h] --> [b s h].
+            hidden_states = self.final_layernorm(hidden_states)
-            hidden_states = hidden_states.transpose(0, 1).contiguous()
-            output = self.final_layernorm(hidden_states) if self.post_layer_norm else hidden_states
-        else:
-            output = hidden_states
-        return output
+        return hidden_states
--- a/megatron/model/vision/vit_backbone.py
+++ b/megatron/model/vision/vit_backbone.py
@@ -21,7 +21,6 @@ import torch
 import apex
 import torch.nn.functional as F
 from megatron import get_args
-from megatron.model import LayerNorm
 from megatron.model.transformer import ParallelTransformer
 from megatron.model.utils import (
    get_linear_layer,

--- a/megatron/mpu/__init__.py
+++ b/megatron/mpu/__init__.py
@@ -49,18 +49,21 @@ from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pi
 from .initialize import initialize_model_parallel
 from .initialize import model_parallel_is_initialized
-from .layers import LinearWithGradAccumulationAndAsyncAllreduce
+from .layers import LinearWithGradAccumulationAndAsyncCommunication
 from .layers import ColumnParallelLinear
 from .layers import RowParallelLinear
 from .layers import VocabParallelEmbedding
 from .layers import (set_tensor_model_parallel_attributes,
                     set_defaults_if_not_set_tensor_model_parallel_attributes,
                     copy_tensor_model_parallel_attributes)
-from .mappings import copy_to_tensor_model_parallel_region
+from .mappings import  copy_to_tensor_model_parallel_region
-from .mappings import gather_from_tensor_model_parallel_region
+from .mappings import  reduce_from_tensor_model_parallel_region
-from .mappings import reduce_from_tensor_model_parallel_region
+from .mappings import  scatter_to_tensor_model_parallel_region
-from .mappings import scatter_to_tensor_model_parallel_region
+from .mappings import  gather_from_tensor_model_parallel_region
+from .mappings import  scatter_to_sequence_parallel_region
+from .mappings import  gather_from_sequence_parallel_region
+from .mappings import  reduce_scatter_to_sequence_parallel_region
 from .random import checkpoint
 from .random import get_cuda_rng_tracker

--- a/megatron/mpu/layers.py
+++ b/megatron/mpu/layers.py
@@ -30,20 +30,21 @@ from .initialize import get_tensor_model_parallel_world_size
 from .initialize import get_tensor_model_parallel_group
 from .mappings import copy_to_tensor_model_parallel_region
 from .mappings import gather_from_tensor_model_parallel_region
+from .mappings import gather_from_sequence_parallel_region
 from .mappings import reduce_from_tensor_model_parallel_region
 from .mappings import scatter_to_tensor_model_parallel_region
+from .mappings import reduce_scatter_to_sequence_parallel_region
 from .random import get_cuda_rng_tracker
 from .utils import divide
 from .utils import split_tensor_along_last_dim
 from .utils import VocabUtility
 from megatron import get_args
 _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                                      'partition_dim': -1,
                                      'partition_stride': 1}
 def param_is_not_tensor_parallel_duplicate(param):
    return (hasattr(param, 'tensor_model_parallel') and
            param.tensor_model_parallel) or (
@@ -199,19 +200,39 @@ class VocabParallelEmbedding(torch.nn.Module):
        return output
-class LinearWithGradAccumulationAndAsyncAllreduce(torch.autograd.Function):
+class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
    """
-    Linear layer execution with asynchronous all-reduce and gradient accumulation
+    Linear layer execution with asynchronous communication and gradient accumulation
    fusion in backprop.
    """
    @staticmethod
    def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
-                async_grad_allreduce):
+                async_grad_allreduce, sequence_parallel):
        ctx.save_for_backward(input, weight)
        ctx.use_bias = bias is not None
        ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
        ctx.async_grad_allreduce = async_grad_allreduce
-        output = torch.matmul(input, weight.t())
+        ctx.sequence_parallel = sequence_parallel
+        if sequence_parallel:
+            world_size = get_tensor_model_parallel_world_size()
+            dim_size = list(input.size())
+            dim_size[0] = dim_size[0] * world_size
+            all_gather_buffer = \
+                torch.empty(dim_size, dtype=input.dtype,
+                            device=torch.cuda.current_device(),
+                            requires_grad=False)
+            torch.distributed._all_gather_base(
+                all_gather_buffer,
+                input,
+                group=get_tensor_model_parallel_group())
+            total_input = all_gather_buffer
+        else:
+            total_input = input
+        output = torch.matmul(total_input, weight.t())
        if bias is not None:
            output = output + bias
        return output
@@ -220,13 +241,39 @@ class LinearWithGradAccumulationAndAsyncAllreduce(torch.autograd.Function):
    def backward(ctx, grad_output):
        input, weight = ctx.saved_tensors
        use_bias = ctx.use_bias
+        if ctx.sequence_parallel:
+            world_size = get_tensor_model_parallel_world_size()
+            dim_size = list(input.size())
+            dim_size[0] = dim_size[0] * world_size
+            all_gather_buffer = \
+                torch.empty(dim_size, dtype=input.dtype,
+                            device=torch.cuda.current_device(),
+                            requires_grad=False)
+            handle = torch.distributed._all_gather_base(
+                all_gather_buffer,
+                input,
+                group=get_tensor_model_parallel_group(), async_op=True)
+            # Delay the start of input gradient computation shortly (3us) to have
+            # gather scheduled first and have GPU resources allocated
+            _ = torch.empty(1, device=grad_output.device) + 1
+            total_input = all_gather_buffer
+        else:
+            total_input = input
        grad_input = grad_output.matmul(weight)
+        if ctx.sequence_parallel:
+            handle.wait()
        # Convert the tensor shapes to 2D for execution compatibility
        grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                                       grad_output.shape[2])
-        input = input.view(input.shape[0] * input.shape[1], input.shape[2])
+        total_input = total_input.view(total_input.shape[0] * total_input.shape[1],
+				       total_input.shape[2])
        if ctx.async_grad_allreduce:
            # Asynchronous all-reduce
            handle = torch.distributed.all_reduce(
@@ -234,16 +281,38 @@ class LinearWithGradAccumulationAndAsyncAllreduce(torch.autograd.Function):
            # Delay the start of weight gradient computation shortly (3us) to have
            # all-reduce scheduled first and have GPU resources allocated
            _ = torch.empty(1, device=grad_output.device) + 1
+        if ctx.sequence_parallel:
+            assert not ctx.async_grad_allreduce
+            dim_size = list(input.size())
+            sub_grad_input = torch.empty(dim_size, dtype=input.dtype,
+                                         device=torch.cuda.current_device(),
+                                         requires_grad=False)
+            # reduce_scatter
+            handle = torch.distributed._reduce_scatter_base(sub_grad_input, grad_input, 
+                                                            group=get_tensor_model_parallel_group(),
+                                                            async_op=True)
+            # Delay the start of weight gradient computation shortly (3us) to have
+            # reduce scatter scheduled first and have GPU resources allocated
+            _ = torch.empty(1, device=grad_output.device) + 1
        if ctx.gradient_accumulation_fusion:
            import fused_dense_cuda
-            fused_dense_cuda.wgrad_gemm_accum_fp32(input, grad_output, weight.main_grad)
+            fused_dense_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad)
            grad_weight = None
        else:
-            grad_weight = grad_output.t().matmul(input)
+            grad_weight = grad_output.t().matmul(total_input)
        grad_bias = grad_output.sum(dim=0) if use_bias else None
+        if ctx.sequence_parallel:
+            handle.wait()
+            return sub_grad_input, grad_weight, grad_bias, None, None, None
        if ctx.async_grad_allreduce:
            handle.wait()
-        return grad_input, grad_weight, grad_bias, None, None
+        return grad_input, grad_weight, grad_bias, None, None, None
 class ColumnParallelLinear(torch.nn.Module):
@@ -323,23 +392,28 @@ class ColumnParallelLinear(torch.nn.Module):
        self.async_tensor_model_parallel_allreduce = (
                args.async_tensor_model_parallel_allreduce and
                world_size > 1)
+        self.sequence_parallel = (
+                args.sequence_parallel and
+                world_size > 1)
+        assert not self.async_tensor_model_parallel_allreduce or \
+            not self.sequence_parallel
        self.gradient_accumulation_fusion = args.gradient_accumulation_fusion
    def forward(self, input_):
        bias = self.bias if not self.skip_bias_add else None
-        if self.async_tensor_model_parallel_allreduce:
+        if self.async_tensor_model_parallel_allreduce or \
+                self.sequence_parallel:
            input_parallel = input_
        else:
-            # Set up backprop all-reduce.
            input_parallel = copy_to_tensor_model_parallel_region(input_)
        # Matrix multiply.
-        output_parallel = LinearWithGradAccumulationAndAsyncAllreduce.apply(
+        output_parallel = LinearWithGradAccumulationAndAsyncCommunication.apply(
            input_parallel, self.weight, bias, self.gradient_accumulation_fusion,
-            self.async_tensor_model_parallel_allreduce)
+            self.async_tensor_model_parallel_allreduce, self.sequence_parallel)
        if self.gather_output:
            # All-gather across the partitions.
+            assert not self.sequence_parallel
            output = gather_from_tensor_model_parallel_region(output_parallel)
        else:
            output = output_parallel
@@ -420,26 +494,34 @@ class RowParallelLinear(torch.nn.Module):
                self.bias = Parameter(torch.empty(
                    self.output_size, device=torch.cuda.current_device(),
                    dtype=args.params_dtype))
+            setattr(self.bias, 'sequence_parallel', args.sequence_parallel)
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)
+        self.sequence_parallel = args.sequence_parallel
        self.gradient_accumulation_fusion = args.gradient_accumulation_fusion
    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            input_parallel = input_
        else:
+            assert not self.sequence_parallel
            input_parallel = scatter_to_tensor_model_parallel_region(input_)
        # Matrix multiply.
-        output_parallel = LinearWithGradAccumulationAndAsyncAllreduce.apply(
+        output_parallel = LinearWithGradAccumulationAndAsyncCommunication.apply(
            input_parallel, self.weight, None,
-            self.gradient_accumulation_fusion, None)
+            self.gradient_accumulation_fusion, None, None)
        # All-reduce across all the partitions.
-        output_ = reduce_from_tensor_model_parallel_region(output_parallel)
+        if self.sequence_parallel:
+            output_ = reduce_scatter_to_sequence_parallel_region(output_parallel)
+        else:
+            output_ = reduce_from_tensor_model_parallel_region(output_parallel)
        if not self.skip_bias_add:
            output = output_ + self.bias if self.bias is not None else output_
            output_bias = None

--- a/megatron/mpu/mappings.py
+++ b/megatron/mpu/mappings.py
@@ -32,13 +32,13 @@ def _reduce(input_):
    return input_
-def _split(input_):
+def _split_along_last_dim(input_):
    """Split the tensor along its last dimension and keep the
    corresponding slice."""
    world_size = get_tensor_model_parallel_world_size()
    # Bypass the function if we are using only 1 GPU.
-    if world_size==1:
+    if world_size == 1:
        return input_
    # Split along last dimension.
@@ -51,12 +51,34 @@ def _split(input_):
    return output
-def _gather(input_):
+def _split_along_first_dim(input_):
+    """Split the tensor along its first dimension and keep the
+    corresponding slice.
+
+    Returns `input_` unchanged when the tensor model parallel world size
+    is 1; otherwise returns a contiguous tensor holding the slice of the
+    first dimension that belongs to this tensor model parallel rank.
+    """
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    # Split along first dimension.
+    dim_size = input_.size()[0]
+    assert dim_size % world_size == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+    local_dim_size = dim_size // world_size
+    # Rank r keeps rows [r * local_dim_size, (r + 1) * local_dim_size).
+    rank = get_tensor_model_parallel_rank()
+    dim_offset = rank * local_dim_size
+    output = input_[dim_offset:dim_offset+local_dim_size].contiguous()
+    return output
+def _gather_along_last_dim(input_):
    """Gather tensors and concatinate along the last dimension."""
    world_size = get_tensor_model_parallel_world_size()
    # Bypass the function if we are using only 1 GPU.
-    if world_size==1:
+    if world_size == 1:
        return input_
    # Size and dimension.
@@ -73,6 +95,44 @@ def _gather(input_):
    return output
+def _gather_along_first_dim(input_):
+    """Gather tensors and concatenate along the first dimension.
+
+    Returns `input_` unchanged when the tensor model parallel world size
+    is 1; otherwise returns a new tensor whose first dimension is
+    `world_size` times that of `input_`, filled by an all-gather over the
+    tensor model parallel group.
+    """
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    # The output buffer has the input's shape with dim 0 scaled by world_size.
+    dim_size = list(input_.size())
+    dim_size[0] = dim_size[0] * world_size
+    output = torch.empty(dim_size, dtype=input_.dtype,
+                         device=torch.cuda.current_device())
+    # NOTE(review): presumably each rank's chunk lands in rank order along
+    # dim 0 -- confirm against the torch.distributed documentation.
+    torch.distributed._all_gather_base(output, input_.contiguous(),
+                                       group=get_tensor_model_parallel_group())
+    return output
+def _reduce_scatter_along_first_dim(input_):
+    """Reduce-scatter the input tensor across model parallel group.
+
+    Returns `input_` unchanged when the tensor model parallel world size
+    is 1; otherwise returns a new tensor whose first dimension is the
+    input's first dimension divided by `world_size`, filled by a
+    reduce-scatter over the tensor model parallel group.
+    """
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    dim_size = list(input_.size())
+    assert dim_size[0] % world_size == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+    # The output buffer has the input's shape with dim 0 divided by world_size.
+    dim_size[0] = dim_size[0] // world_size
+    output = torch.empty(dim_size, dtype=input_.dtype,
+                         device=torch.cuda.current_device())
+    # NOTE(review): no reduce op is passed, so the collective's default
+    # (presumably a sum) applies -- confirm.
+    torch.distributed._reduce_scatter_base(output, input_.contiguous(), 
+                                           group=get_tensor_model_parallel_group())
+    return output
 class _CopyToModelParallelRegion(torch.autograd.Function):
    """Pass the input to the model parallel region."""
@@ -110,15 +170,15 @@ class _ScatterToModelParallelRegion(torch.autograd.Function):
    @staticmethod
    def symbolic(graph, input_):
-        return _split(input_)
+        return _split_along_last_dim(input_)
    @staticmethod
    def forward(ctx, input_):
-        return _split(input_)
+        return _split_along_last_dim(input_)
    @staticmethod
    def backward(ctx, grad_output):
-        return _gather(grad_output)
+        return _gather_along_last_dim(grad_output)
 class _GatherFromModelParallelRegion(torch.autograd.Function):
@@ -126,15 +186,63 @@ class _GatherFromModelParallelRegion(torch.autograd.Function):
    @staticmethod
    def symbolic(graph, input_):
-        return _gather(input_)
+        return _gather_along_last_dim(input_)
+    @staticmethod
+    def forward(ctx, input_):
+        return _gather_along_last_dim(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _split_along_last_dim(grad_output)
+class _ScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the chunk corresponding to the rank.
+
+    Forward splits along the first dimension; backward all-gathers the
+    incoming gradient along the first dimension.
+    """
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split_along_first_dim(input_)
+    @staticmethod
+    def forward(ctx, input_):
+        return _split_along_first_dim(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Gathering restores the first-dimension size that forward split.
+        return _gather_along_first_dim(grad_output)
+class _GatherFromSequenceParallelRegion(torch.autograd.Function):
+    """Gather the input from sequence parallel region and concatenate.""" #TODO
+    @staticmethod
+    def symbolic(graph, input_):
+        return _gather_along_first_dim(input_)
    @staticmethod
    def forward(ctx, input_):
-        return _gather(input_)
+        return _gather_along_first_dim(input_)
    @staticmethod
    def backward(ctx, grad_output):
-        return _split(grad_output)
+        return _reduce_scatter_along_first_dim(grad_output)
+class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Reduce scatter the input from the model parallel region.
+
+    Forward reduce-scatters along the first dimension; backward
+    all-gathers the incoming gradient along the first dimension.
+    """
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce_scatter_along_first_dim(input_)
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce_scatter_along_first_dim(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Gathering restores the first-dimension size that forward scattered.
+        return _gather_along_first_dim(grad_output)
 # -----------------
@@ -155,3 +263,16 @@ def scatter_to_tensor_model_parallel_region(input_):
 def gather_from_tensor_model_parallel_region(input_):
    return _GatherFromModelParallelRegion.apply(input_)
+def scatter_to_sequence_parallel_region(input_):
+    """Apply the `_ScatterToSequenceParallelRegion` autograd function to `input_`."""
+    return _ScatterToSequenceParallelRegion.apply(input_)
+def gather_from_sequence_parallel_region(input_):
+    """Apply the `_GatherFromSequenceParallelRegion` autograd function to `input_`."""
+    return _GatherFromSequenceParallelRegion.apply(input_)
+def reduce_scatter_to_sequence_parallel_region(input_):
+    """Apply the `_ReduceScatterToSequenceParallelRegion` autograd function to `input_`."""
+    return _ReduceScatterToSequenceParallelRegion.apply(input_)
--- a/megatron/mpu/random.py
+++ b/megatron/mpu/random.py
@@ -307,10 +307,10 @@ class CheckpointFunction(torch.autograd.Function):
              tracked/set/reset.
    """
    @staticmethod
-    def forward(ctx, run_function, distribute_checkpointed_activations, *args):
+    def forward(ctx, run_function, distribute_saved_activations, *args):
        ctx.run_function = run_function
-        ctx.distribute_checkpointed_activations \
+        ctx.distribute_saved_activations \
-            = distribute_checkpointed_activations
+            = distribute_saved_activations
        # Copy the rng states.
        ctx.fwd_cpu_rng_state = torch.get_rng_state()
@@ -322,7 +322,7 @@ class CheckpointFunction(torch.autograd.Function):
        # Divide hidden states across model parallel group and only keep
        # the chunk corresponding to the current rank.
-        if distribute_checkpointed_activations:
+        if distribute_saved_activations:
            ctx.input_0_shape = args[0].data.shape
            safely_set_viewless_tensor_data(
                args[0],
@@ -339,7 +339,7 @@ class CheckpointFunction(torch.autograd.Function):
            raise RuntimeError("Checkpointing is not compatible with .grad(), "
                               "please use .backward() if possible")
        inputs = ctx.saved_tensors
-        if ctx.distribute_checkpointed_activations:
+        if ctx.distribute_saved_activations:
            safely_set_viewless_tensor_data(
                inputs[0],
                gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
@@ -372,8 +372,8 @@ class CheckpointFunction(torch.autograd.Function):
        return (None, None) + grads
-def checkpoint(function, distribute_checkpointed_activations, *args):
+def checkpoint(function, distribute_saved_activations, *args):
    """Checkpoint a model or part of the model.
    This has been directly copied from torch.utils.checkpoint."""
    return CheckpointFunction.apply(function,
-                                    distribute_checkpointed_activations, *args)
+                                    distribute_saved_activations, *args)
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -17,7 +17,6 @@ from apex.optimizers import FusedAdam as Adam
 from apex.optimizers import FusedSGD as SGD
 from megatron import get_args
-from megatron.model import LayerNorm
 from .grad_scaler import ConstantGradScaler, DynamicGradScaler
 from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer

--- a/megatron/optimizer/optimizer.py
+++ b/megatron/optimizer/optimizer.py
@@ -264,7 +264,6 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                        if param in self.optimizer.state:
                            self.optimizer.state[main_param] \
                                = self.optimizer.state.pop(param)
                    # fp32 params.
                    elif param.type() == 'torch.cuda.FloatTensor':
                        fp32_params_this_group.append(param)
@@ -282,10 +281,6 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                fp32_from_float16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)
-        # Leverage state_dict() and load_state_dict() to
-        # recast preexisting per-param state tensors
-        self.optimizer.load_state_dict(self.optimizer.state_dict())
    def zero_grad(self, set_to_none=True):
        """We only need to zero the model related parameters, i.e.,

--- a/megatron/p2p_communication.py
+++ b/megatron/p2p_communication.py
@@ -61,7 +61,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
        tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
    override_scatter_gather_tensors_in_pipeline = False
-    if args.scatter_gather_tensors_in_pipeline:
+    if args.scatter_gather_tensors_in_pipeline and \
+            not args.sequence_parallel:
        tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
        if tensor_chunk_shape % mpu.get_tensor_model_parallel_world_size() == 0:
            tensor_chunk_shape = tensor_chunk_shape // \
@@ -93,7 +94,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
    # Split tensor into smaller chunks if using scatter-gather optimization.
    if not override_scatter_gather_tensors_in_pipeline and \
-            args.scatter_gather_tensors_in_pipeline:
+            args.scatter_gather_tensors_in_pipeline and \
+            not args.sequence_parallel:
        if tensor_send_next is not None:
            tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(tensor_send_next)
@@ -138,7 +140,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
    # If using scatter-gather optimization, gather smaller chunks.
    if not override_scatter_gather_tensors_in_pipeline and \
-            args.scatter_gather_tensors_in_pipeline:
+            args.scatter_gather_tensors_in_pipeline and \
+            not args.sequence_parallel:
        if recv_prev:
            tensor_recv_prev = mpu.gather_split_1d_tensor(
                tensor_recv_prev).view(tensor_shape).requires_grad_()

--- a/megatron/schedules.py
+++ b/megatron/schedules.py
@@ -279,8 +279,12 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
    pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()
    args = get_args()
-    tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
+    if args.sequence_parallel:
+        seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
+    else:
+        seq_length = args.seq_length
+    tensor_shape = (seq_length, args.micro_batch_size, args.hidden_size)
    # Compute number of warmup and remaining microbatches.
    num_model_chunks = len(model)
    num_microbatches = get_num_microbatches() * num_model_chunks
@@ -514,18 +518,25 @@ def get_tensor_shapes(rank, model_type):
    # Otherwise, send one tensor (pre-transpose).
    args = get_args()
    tensor_shapes = []
+    if args.sequence_parallel:
+        seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
+    else:
+        seq_length = args.seq_length
    if model_type == ModelType.encoder_and_decoder:
+        if args.sequence_parallel:
+            decoder_seq_length = args.decoder_seq_length // mpu.get_tensor_model_parallel_world_size()
+        else:
+            decoder_seq_length = args.decoder_seq_length
        if mpu.is_pipeline_stage_before_split(rank):
-            # If next rank is after split, then need transpose for encoder_hidden_state.
+            tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
-            if mpu.is_pipeline_stage_before_split(rank+1):
-                tensor_shapes.append((args.seq_length, args.micro_batch_size, args.hidden_size))
-            else:
-                tensor_shapes.append((args.micro_batch_size, args.seq_length, args.hidden_size))
        else:
-            tensor_shapes.append((args.decoder_seq_length, args.micro_batch_size, args.hidden_size))
+            tensor_shapes.append((decoder_seq_length, args.micro_batch_size, args.hidden_size))
-            tensor_shapes.append((args.micro_batch_size, args.seq_length, args.hidden_size))
+            tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
    else:
-        tensor_shapes.append((args.seq_length, args.micro_batch_size, args.hidden_size))
+        tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
    return tensor_shapes

--- a/megatron/training.py
+++ b/megatron/training.py
@@ -23,6 +23,7 @@ import time
 _TRAIN_START_TIME = time.time()
 import torch
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from megatron import get_args
 from megatron import get_signal_handler
@@ -421,6 +422,25 @@ def train_step(forward_step_func, data_iterator,
    if args.empty_unused_memory_level >= 1:
        torch.cuda.empty_cache()
+    # All-reduce the gradients of layernorm parameters (those flagged with
+    # `sequence_parallel`) across the tensor model parallel group when
+    # sequence parallelism is used.
+    if mpu.get_tensor_model_parallel_world_size() > 1 and \
+            args.sequence_parallel:
+        grads = []
+        for model_module in model:
+            unwrapped_model = unwrap_model( 
+                model_module, (torchDDP, LocalDDP, Float16Module))
+            for param in unwrapped_model.parameters():
+                if getattr(param, 'sequence_parallel', False):
+                    grad = param.main_grad if args.DDP_impl == 'local' else param.grad
+                    grads.append(grad.data)
+        coalesced = _flatten_dense_tensors(grads)
+        torch.distributed.all_reduce(
+            coalesced, group=mpu.get_tensor_model_parallel_group())
+        for buf, synced in zip(grads, _unflatten_dense_tensors(
+                coalesced, grads)):
+            buf.copy_(synced)
    # All-reduce if needed.
    if args.DDP_impl == 'local':
        timers('backward-params-all-reduce').start()