Commit 80389ef6 authored by Jared Casper

Merge branch 'main' into checkpoint_util

parents 1b2db724 d07d29df
......@@ -32,20 +32,26 @@ def post_language_model_processing(lm_output, labels, logit_weights,
parallel_output,
fp16_lm_cross_entropy):
# Output.
# Output. Format [s b h]
output = parallel_lm_logits(
lm_output,
logit_weights,
parallel_output)
if labels is None:
return output
# [s b h] => [b s h]
return output.transpose(0,1).contiguous()
else:
# [b s] => [s b]
labels = labels.transpose(0,1).contiguous()
if fp16_lm_cross_entropy:
assert output.dtype == torch.half
loss = mpu.vocab_parallel_cross_entropy(output, labels)
else:
loss = mpu.vocab_parallel_cross_entropy(output.float(), labels)
# [s b] => [b s]
loss = loss.transpose(0,1).contiguous()
return loss
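A minimal single-process sketch of the layout convention above: logits leave the model as [s, b, h] and labels arrive as [b, s], so labels are transposed before the loss and the per-token loss is transposed back to [b, s] for the caller. The dense torch cross-entropy stands in for mpu.vocab_parallel_cross_entropy here; shapes are illustrative.

import torch
import torch.nn.functional as F

s, b, h = 4, 2, 8                                   # sequence length, micro-batch size, vocab size
output = torch.randn(s, b, h)                       # parallel_lm_logits output: [s, b, h]
labels = torch.randint(0, h, (b, s))                # labels from the data loader: [b, s]

labels_sb = labels.transpose(0, 1).contiguous()     # [b, s] -> [s, b]
loss_sb = F.cross_entropy(output.reshape(s * b, h), # stand-in for vocab_parallel_cross_entropy
                          labels_sb.reshape(s * b),
                          reduction='none').view(s, b)
loss_bs = loss_sb.transpose(0, 1).contiguous()      # [s, b] -> [b, s] returned to the caller
print(loss_bs.shape)                                # torch.Size([2, 4])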
......
......@@ -26,17 +26,29 @@ from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal, scaled_init_method_normal
def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
bias=None):
"""LM logits using word embedding weights."""
args = get_args()
# Parallel logits.
input_parallel = mpu.copy_to_tensor_model_parallel_region(input_)
# Matrix multiply.
if bias is None:
logits_parallel = F.linear(input_parallel, word_embeddings_weight)
if args.async_tensor_model_parallel_allreduce or\
args.sequence_parallel:
input_parallel = input_
model_parallel = mpu.get_tensor_model_parallel_world_size() > 1
async_grad_allreduce = args.async_tensor_model_parallel_allreduce and \
model_parallel and not args.sequence_parallel
else:
logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias)
input_parallel = mpu.copy_to_tensor_model_parallel_region(input_)
async_grad_allreduce = False
# Matrix multiply.
logits_parallel = mpu.LinearWithGradAccumulationAndAsyncCommunication.apply(
input_parallel, word_embeddings_weight, bias,
args.gradient_accumulation_fusion,
async_grad_allreduce, args.sequence_parallel)
# Gather if needed.
if parallel_output:
return logits_parallel
......@@ -92,12 +104,23 @@ class Pooler(MegatronModule):
def __init__(self, hidden_size, init_method):
super(Pooler, self).__init__()
args = get_args()
self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
self.sequence_parallel = args.sequence_parallel
def forward(self, hidden_states, sequence_index=0):
# hidden_states: [b, s, h]
# hidden_states: [s, b, h]
# sequence_index: index of the token to pool.
pooled = hidden_states[:, sequence_index, :]
# gather data along the sequence dimension
# same pooler is run on all tensor parallel nodes
if self.sequence_parallel:
hidden_states = mpu.gather_from_sequence_parallel_region(
hidden_states,
tensor_parallel_output_grad=False)
pooled = hidden_states[sequence_index, :, :]
pooled = self.dense(pooled)
pooled = torch.tanh(pooled)
return pooled
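A single-process sketch of the pooling change above: with hidden_states in [s, b, h] layout, the pooled token is selected along dim 0 rather than dim 1. The sequence-parallel gather is omitted here since it is the identity at tensor-parallel world size 1; the Linear stands in for get_linear_layer.

import torch

s, b, h = 6, 3, 16
hidden_states = torch.randn(s, b, h)                # [s, b, h] after the transformer
dense = torch.nn.Linear(h, h)                       # stand-in for get_linear_layer(...)
pooled = hidden_states[0, :, :]                     # sequence_index=0 -> [b, h]
pooled = torch.tanh(dense(pooled))
print(pooled.shape)                                 # torch.Size([3, 16])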
......@@ -160,6 +183,8 @@ class Embedding(MegatronModule):
else:
self.tokentype_embeddings = None
self.fp32_residual_connection = args.fp32_residual_connection
self.sequence_parallel = args.sequence_parallel
# Embeddings dropout
self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
......@@ -201,8 +226,20 @@ class Embedding(MegatronModule):
else:
assert self.tokentype_embeddings is None
# Data format change to avoid explicit transposes: [b s h] --> [s b h].
embeddings = embeddings.transpose(0, 1).contiguous()
# If the input flag for fp32 residual connection is set, convert for float.
if self.fp32_residual_connection:
embeddings = embeddings.float()
# Dropout.
embeddings = self.embedding_dropout(embeddings)
if self.sequence_parallel:
embeddings = mpu.scatter_to_sequence_parallel_region(embeddings)
with mpu.get_cuda_rng_tracker().fork():
embeddings = self.embedding_dropout(embeddings)
else:
embeddings = self.embedding_dropout(embeddings)
return embeddings
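A sketch of the shape effect of scatter_to_sequence_parallel_region on the [s, b, h] embeddings, simulated in a single process with a hypothetical tensor-parallel layout: each rank keeps a contiguous s/world_size slice of the sequence dimension, and dropout on that slice runs under the forked CUDA RNG tracker so different ranks draw different masks.

import torch

world_size, rank = 4, 1                             # hypothetical tensor-parallel size and this rank's index
s, b, h = 8, 2, 16
embeddings = torch.randn(s, b, h)                   # full [s, b, h] embeddings

local_s = s // world_size
local = embeddings[rank * local_s:(rank + 1) * local_s].contiguous()   # this rank's [s/ws, b, h] slice
local = torch.nn.Dropout(p=0.1)(local)              # in Megatron this runs inside get_cuda_rng_tracker().fork()
print(local.shape)                                  # torch.Size([2, 2, 16])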
......
......@@ -152,19 +152,24 @@ class T5Model(MegatronModule):
if self.post_process and self.add_decoder:
decoder_output, encoder_output = lm_output
# Output.
# Output. [s, b, h]
lm_logits = self.lm_head(decoder_output,
self.word_embeddings_weight())
if lm_labels is None:
return lm_logits
# [s b h] => [b s h]
return lm_logits.transpose(0,1).contiguous()
else:
# [b s] => [s b]
lm_labels = lm_labels.transpose(0,1).contiguous()
if self.fp16_lm_cross_entropy:
assert lm_logits.dtype == torch.half
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
else:
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
lm_labels)
# [s b] => [b s]
lm_loss = lm_loss.transpose(0,1).contiguous()
return lm_loss
elif self.add_decoder and not self.add_encoder:
decoder_output, encoder_output = lm_output
......
......@@ -21,7 +21,6 @@ import torch
import apex
import torch.nn.functional as F
from megatron import get_args
from megatron.model import LayerNorm
from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import (
get_linear_layer,
......@@ -148,6 +147,7 @@ class VitBackbone(MegatronModule):
post_process=True,
class_token=True,
single_token_output=False,
post_layer_norm=True,
drop_path_rate=0.0):
super(VitBackbone, self).__init__(share_word_embeddings=False)
args = get_args()
......@@ -165,6 +165,7 @@ class VitBackbone(MegatronModule):
self.pre_process = pre_process
self.post_process = post_process
self.class_token = class_token
self.post_layer_norm = post_layer_norm
self.hidden_size = args.hidden_size
self.patch_dim = args.patch_dim
self.img_h = args.img_h
......@@ -218,6 +219,7 @@ class VitBackbone(MegatronModule):
self.scaled_init_method,
pre_process=self.pre_process,
post_process=self.post_process,
post_layer_norm=self.post_layer_norm,
drop_path_rate=self.drop_path_rate
)
......
......@@ -49,17 +49,21 @@ from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pi
from .initialize import initialize_model_parallel
from .initialize import model_parallel_is_initialized
from .layers import LinearWithGradAccumulationAndAsyncCommunication
from .layers import ColumnParallelLinear
from .layers import RowParallelLinear
from .layers import VocabParallelEmbedding
from .layers import (set_tensor_model_parallel_attributes,
set_defaults_if_not_set_tensor_model_parallel_attributes,
copy_tensor_model_parallel_attributes)
from .mappings import copy_to_tensor_model_parallel_region
from .mappings import gather_from_tensor_model_parallel_region
from .mappings import reduce_from_tensor_model_parallel_region
from .mappings import scatter_to_tensor_model_parallel_region
from .mappings import copy_to_tensor_model_parallel_region
from .mappings import reduce_from_tensor_model_parallel_region
from .mappings import scatter_to_tensor_model_parallel_region
from .mappings import gather_from_tensor_model_parallel_region
from .mappings import scatter_to_sequence_parallel_region
from .mappings import gather_from_sequence_parallel_region
from .mappings import reduce_scatter_to_sequence_parallel_region
from .random import checkpoint
from .random import get_cuda_rng_tracker
......
......@@ -54,6 +54,12 @@ _POSITION_EMBEDDING_GLOBAL_RANKS = None
# rank when broadcasting from the first or last pipeline stage.
_PIPELINE_GLOBAL_RANKS = None
# A list of global ranks for each data parallel group to ease calculation of the source
# rank when broadcasting weights from src to all other data parallel ranks
_DATA_PARALLEL_GLOBAL_RANKS = None
def is_unitialized():
"""Useful for code segments that may be accessed with or without mpu initialization"""
return _DATA_PARALLEL_GROUP is None
......@@ -124,6 +130,7 @@ def initialize_model_parallel(tensor_model_parallel_size_=1,
# Build the data-parallel groups.
global _DATA_PARALLEL_GROUP
global _DATA_PARALLEL_GLOBAL_RANKS
assert _DATA_PARALLEL_GROUP is None, \
'data parallel group is already initialized'
all_data_parallel_group_ranks = []
......@@ -137,6 +144,7 @@ def initialize_model_parallel(tensor_model_parallel_size_=1,
group = torch.distributed.new_group(ranks)
if rank in ranks:
_DATA_PARALLEL_GROUP = group
_DATA_PARALLEL_GLOBAL_RANKS = ranks
# Build the model-parallel groups.
global _MODEL_PARALLEL_GROUP
......@@ -478,11 +486,10 @@ def get_tensor_model_parallel_src_rank():
def get_data_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the tensor model parallel group."""
global_rank = torch.distributed.get_rank()
data_parallel_size = get_data_parallel_world_size()
num_data_parallel_groups = torch.distributed.get_world_size() // data_parallel_size
return global_rank % num_data_parallel_groups
in the data parallel group."""
assert _DATA_PARALLEL_GLOBAL_RANKS is not None, \
"Data parallel group is not initialized"
return _DATA_PARALLEL_GLOBAL_RANKS[0]
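A quick pure-Python illustration of why caching the group's global ranks simplifies this lookup, using a hypothetical layout of 8 GPUs with tensor-parallel size 2 and no pipeline parallelism: the data-parallel groups stride across the tensor-parallel ranks, and the broadcast source for each group is simply its first (lowest) global rank.

world_size, tensor_model_parallel_size = 8, 2       # hypothetical layout
data_parallel_groups = [list(range(start, world_size, tensor_model_parallel_size))
                        for start in range(tensor_model_parallel_size)]
print(data_parallel_groups)                         # [[0, 2, 4, 6], [1, 3, 5, 7]]

for ranks in data_parallel_groups:                  # _DATA_PARALLEL_GLOBAL_RANKS for ranks in that group
    src_rank = ranks[0]                             # what get_data_parallel_src_rank() now returns
    assert src_rank == min(ranks)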
def get_pipeline_model_parallel_first_rank():
......
......@@ -30,20 +30,21 @@ from .initialize import get_tensor_model_parallel_world_size
from .initialize import get_tensor_model_parallel_group
from .mappings import copy_to_tensor_model_parallel_region
from .mappings import gather_from_tensor_model_parallel_region
from .mappings import gather_from_sequence_parallel_region
from .mappings import reduce_from_tensor_model_parallel_region
from .mappings import scatter_to_tensor_model_parallel_region
from .mappings import reduce_scatter_to_sequence_parallel_region
from .random import get_cuda_rng_tracker
from .utils import divide
from .utils import split_tensor_along_last_dim
from .utils import VocabUtility
from megatron import get_args
from megatron import get_args, get_global_memory_buffer
_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
'partition_dim': -1,
'partition_stride': 1}
def param_is_not_tensor_parallel_duplicate(param):
return (hasattr(param, 'tensor_model_parallel') and
param.tensor_model_parallel) or (
......@@ -201,16 +202,37 @@ class VocabParallelEmbedding(torch.nn.Module):
return output
class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
"""
Column-parallel linear layer execution with asynchronous all-reduce
execution in backprop.
Linear layer execution with asynchronous communication and gradient accumulation
fusion in backprop.
"""
@staticmethod
def forward(ctx, input, weight, bias):
def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
async_grad_allreduce, sequence_parallel):
ctx.save_for_backward(input, weight)
ctx.use_bias = bias is not None
output = torch.matmul(input, weight.t())
ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
ctx.async_grad_allreduce = async_grad_allreduce
ctx.sequence_parallel = sequence_parallel
if sequence_parallel:
world_size = get_tensor_model_parallel_world_size()
dim_size = list(input.size())
dim_size[0] = dim_size[0] * world_size
all_gather_buffer = \
get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
torch.distributed._all_gather_base(
all_gather_buffer,
input,
group=get_tensor_model_parallel_group())
total_input = all_gather_buffer
else:
total_input = input
output = torch.matmul(total_input, weight.t())
if bias is not None:
output = output + bias
return output
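A single-process sketch of the sequence-parallel forward path above: the per-rank [s/ws, b, h] activation is all-gathered along dim 0 into the full [s, b, h] tensor before the GEMM. torch.cat over simulated rank slices stands in for torch.distributed._all_gather_base; sizes are illustrative.

import torch

world_size, s, b, h, out = 4, 8, 2, 16, 32
full_activation = torch.randn(s, b, h)
weight = torch.randn(out, h)

rank_slices = full_activation.chunk(world_size, dim=0)      # what each rank would hold locally
total_input = torch.cat(rank_slices, dim=0)                 # simulated all-gather: [s, b, h]
output = torch.matmul(total_input, weight.t())              # same GEMM as in forward()
print(output.shape)                                         # torch.Size([8, 2, 32])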
......@@ -219,17 +241,75 @@ class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
def backward(ctx, grad_output):
input, weight = ctx.saved_tensors
use_bias = ctx.use_bias
if ctx.sequence_parallel:
world_size = get_tensor_model_parallel_world_size()
dim_size = list(input.size())
dim_size[0] = dim_size[0] * world_size
all_gather_buffer = \
get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
handle = torch.distributed._all_gather_base(
all_gather_buffer,
input,
group=get_tensor_model_parallel_group(), async_op=True)
# Delay the start of input gradient computation shortly (3us) to have
# gather scheduled first and have GPU resources allocated
_ = torch.empty(1, device=grad_output.device) + 1
total_input = all_gather_buffer
else:
total_input = input
grad_input = grad_output.matmul(weight)
# Asynchronous all-reduce
handle = torch.distributed.all_reduce(
grad_input, group=get_tensor_model_parallel_group(), async_op=True)
# Delay the start of weight gradient computation shortly (3us) to have
# all-reduce scheduled first and have GPU resources allocated
_ = torch.empty(1, device=grad_output.device) + 1
grad_weight = grad_output.t().matmul(input)
if ctx.sequence_parallel:
handle.wait()
# Convert the tensor shapes to 2D for execution compatibility
grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
grad_output.shape[2])
total_input = total_input.view(total_input.shape[0] * total_input.shape[1],
total_input.shape[2])
if ctx.async_grad_allreduce:
# Asynchronous all-reduce
handle = torch.distributed.all_reduce(
grad_input, group=get_tensor_model_parallel_group(), async_op=True)
# Delay the start of weight gradient computation shortly (3us) to have
# all-reduce scheduled first and have GPU resources allocated
_ = torch.empty(1, device=grad_output.device) + 1
if ctx.sequence_parallel:
assert not ctx.async_grad_allreduce
dim_size = list(input.size())
sub_grad_input = torch.empty(dim_size, dtype=input.dtype,
device=torch.cuda.current_device(),
requires_grad=False)
# reduce_scatter
handle = torch.distributed._reduce_scatter_base(sub_grad_input, grad_input,
group=get_tensor_model_parallel_group(),
async_op=True)
# Delay the start of weight gradient computation shortly (3us) to have
# reduce scatter scheduled first and have GPU resources allocated
_ = torch.empty(1, device=grad_output.device) + 1
if ctx.gradient_accumulation_fusion:
import fused_dense_cuda
fused_dense_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad)
grad_weight = None
else:
grad_weight = grad_output.t().matmul(total_input)
grad_bias = grad_output.sum(dim=0) if use_bias else None
handle.wait()
return grad_input, grad_weight, grad_bias
if ctx.sequence_parallel:
handle.wait()
return sub_grad_input, grad_weight, grad_bias, None, None, None
if ctx.async_grad_allreduce:
handle.wait()
return grad_input, grad_weight, grad_bias, None, None, None
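A schematic single-process sketch of the overlap pattern in backward() above: launch the asynchronous collective on grad_input, then run the weight-gradient GEMM so communication overlaps with compute, and only wait on the handle before returning. A trivial stand-in replaces the torch.distributed async work handle, and the tiny torch.empty op mirrors the "delay" trick in the real code.

import torch

class _FakeHandle:                                  # stand-in for the async work handle from torch.distributed
    def wait(self):
        pass

def backward_overlap_sketch(grad_output, total_input, weight):
    grad_input = grad_output.matmul(weight)                  # dgrad GEMM
    handle = _FakeHandle()                                   # async all_reduce(grad_input, async_op=True) would start here
    _ = torch.empty(1, device=grad_output.device) + 1        # small op so the collective gets scheduled first
    go_2d = grad_output.reshape(-1, grad_output.shape[-1])   # [s*b, out]
    in_2d = total_input.reshape(-1, total_input.shape[-1])   # [s*b, in]
    grad_weight = go_2d.t().matmul(in_2d)                    # wgrad GEMM overlaps the (simulated) collective
    handle.wait()                                            # communication must complete before grad_input is used
    return grad_input, grad_weight

s, b, h, out = 4, 2, 8, 16
gi, gw = backward_overlap_sketch(torch.randn(s, b, out),
                                 torch.randn(s, b, h),
                                 torch.randn(out, h))
print(gi.shape, gw.shape)                           # torch.Size([4, 2, 8]) torch.Size([16, 8])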
class ColumnParallelLinear(torch.nn.Module):
......@@ -242,7 +322,7 @@ class ColumnParallelLinear(torch.nn.Module):
input_size: first dimension of matrix A.
output_size: second dimension of matrix A.
bias: If true, add bias
gather_output: If true, call all-gather on output and make Y avaiable
gather_output: If true, call all-gather on output and make Y available
to all GPUs, otherwise, every GPU will have its output
which is Y_i = XA_i
init_method: method to initialize weights. Note that bias is always set
......@@ -309,31 +389,30 @@ class ColumnParallelLinear(torch.nn.Module):
else:
self.register_parameter('bias', None)
self.async_tensor_model_parallel_allreduce = (
not args.no_async_tensor_model_parallel_allreduce and
args.async_tensor_model_parallel_allreduce and
world_size > 1)
self.sequence_parallel = (
args.sequence_parallel and
world_size > 1)
assert not self.async_tensor_model_parallel_allreduce or \
not self.sequence_parallel
self.gradient_accumulation_fusion = args.gradient_accumulation_fusion
def forward(self, input_):
bias = self.bias if not self.skip_bias_add else None
if self.async_tensor_model_parallel_allreduce:
input_shape = input_.shape
input_ = input_.view(input_shape[0] * input_shape[1],input_shape[2])
# Matrix multiply with asynchronous all-reduce execution
output_parallel = ColumnParallelLinearWithAsyncAllreduce.apply(
input_, self.weight, bias)
output_parallel = output_parallel.view(
input_shape[0], input_shape[1], output_parallel.shape[1])
if self.async_tensor_model_parallel_allreduce or \
self.sequence_parallel:
input_parallel = input_
else:
# Set up backprop all-reduce.
input_parallel = copy_to_tensor_model_parallel_region(input_)
# Matrix multiply.
output_parallel = F.linear(input_parallel, self.weight, bias)
# Matrix multiply.
output_parallel = LinearWithGradAccumulationAndAsyncCommunication.apply(
input_parallel, self.weight, bias, self.gradient_accumulation_fusion,
self.async_tensor_model_parallel_allreduce, self.sequence_parallel)
if self.gather_output:
# All-gather across the partitions.
assert not self.sequence_parallel
output = gather_from_tensor_model_parallel_region(output_parallel)
else:
output = output_parallel
......@@ -416,11 +495,15 @@ class RowParallelLinear(torch.nn.Module):
self.bias = Parameter(torch.empty(
self.output_size, device=torch.cuda.current_device(),
dtype=args.params_dtype))
setattr(self.bias, 'sequence_parallel', args.sequence_parallel)
# Always initialize bias to zero.
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)
self.sequence_parallel = args.sequence_parallel
self.gradient_accumulation_fusion = args.gradient_accumulation_fusion
......@@ -429,11 +512,17 @@ class RowParallelLinear(torch.nn.Module):
if self.input_is_parallel:
input_parallel = input_
else:
assert not self.sequence_parallel
input_parallel = scatter_to_tensor_model_parallel_region(input_)
# Matrix multiply.
output_parallel = F.linear(input_parallel, self.weight)
output_parallel = LinearWithGradAccumulationAndAsyncCommunication.apply(
input_parallel, self.weight, None,
self.gradient_accumulation_fusion, None, None)
# All-reduce across all the partitions.
output_ = reduce_from_tensor_model_parallel_region(output_parallel)
if self.sequence_parallel:
output_ = reduce_scatter_to_sequence_parallel_region(output_parallel)
else:
output_ = reduce_from_tensor_model_parallel_region(output_parallel)
if not self.skip_bias_add:
output = output_ + self.bias if self.bias is not None else output_
output_bias = None
......
......@@ -32,13 +32,13 @@ def _reduce(input_):
return input_
def _split(input_):
def _split_along_last_dim(input_):
"""Split the tensor along its last dimension and keep the
corresponding slice."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size==1:
if world_size == 1:
return input_
# Split along last dimension.
......@@ -51,12 +51,34 @@ def _split(input_):
return output
def _gather(input_):
def _split_along_first_dim(input_):
"""Split the tensor along its first dimension and keep the
corresponding slice."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
# Split along first dimension.
dim_size = input_.size()[0]
assert dim_size % world_size == 0, \
"First dimension of the tensor should be divisible by tensor parallel size"
local_dim_size = dim_size // world_size
rank = get_tensor_model_parallel_rank()
dim_offset = rank * local_dim_size
output = input_[dim_offset:dim_offset+local_dim_size].contiguous()
return output
def _gather_along_last_dim(input_):
"""Gather tensors and concatinate along the last dimension."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size==1:
if world_size == 1:
return input_
# Size and dimension.
......@@ -73,6 +95,44 @@ def _gather(input_):
return output
def _gather_along_first_dim(input_):
"""Gather tensors and concatinate along the first dimension."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
dim_size = list(input_.size())
dim_size[0] = dim_size[0] * world_size
output = torch.empty(dim_size, dtype=input_.dtype,
device=torch.cuda.current_device())
torch.distributed._all_gather_base(output, input_.contiguous(),
group=get_tensor_model_parallel_group())
return output
def _reduce_scatter_along_first_dim(input_):
"""Reduce-scatter the input tensor across model parallel group."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return input_
dim_size = list(input_.size())
assert dim_size[0] % world_size == 0, \
"First dimension of the tensor should be divisible by tensor parallel size"
dim_size[0] = dim_size[0] // world_size
output = torch.empty(dim_size, dtype=input_.dtype,
device=torch.cuda.current_device())
torch.distributed._reduce_scatter_base(output, input_.contiguous(),
group=get_tensor_model_parallel_group())
return output
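A single-process simulation of the two first-dim collectives added above, with a list of per-rank tensors standing in for torch.distributed: the all-gather concatenates along dim 0, while the reduce-scatter sums the ranks' full-size inputs elementwise and hands each rank its own dim-0 chunk of the sum.

import torch

world_size, local_s, b, h = 4, 2, 3, 8

# _gather_along_first_dim: [local_s, b, h] per rank -> [world_size * local_s, b, h]
per_rank = [torch.randn(local_s, b, h) for _ in range(world_size)]
gathered = torch.cat(per_rank, dim=0)

# _reduce_scatter_along_first_dim: each rank contributes a full [world_size * local_s, b, h]
# tensor; the elementwise sum is split along dim 0 and chunk r goes to rank r.
full_inputs = [torch.randn(world_size * local_s, b, h) for _ in range(world_size)]
reduced = torch.stack(full_inputs, dim=0).sum(dim=0)
scattered = list(reduced.chunk(world_size, dim=0))

print(gathered.shape, scattered[0].shape)           # torch.Size([8, 3, 8]) torch.Size([2, 3, 8])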
class _CopyToModelParallelRegion(torch.autograd.Function):
"""Pass the input to the model parallel region."""
......@@ -110,15 +170,15 @@ class _ScatterToModelParallelRegion(torch.autograd.Function):
@staticmethod
def symbolic(graph, input_):
return _split(input_)
return _split_along_last_dim(input_)
@staticmethod
def forward(ctx, input_):
return _split(input_)
return _split_along_last_dim(input_)
@staticmethod
def backward(ctx, grad_output):
return _gather(grad_output)
return _gather_along_last_dim(grad_output)
class _GatherFromModelParallelRegion(torch.autograd.Function):
......@@ -126,15 +186,73 @@ class _GatherFromModelParallelRegion(torch.autograd.Function):
@staticmethod
def symbolic(graph, input_):
return _gather(input_)
return _gather_along_last_dim(input_)
@staticmethod
def forward(ctx, input_):
return _gather(input_)
return _gather_along_last_dim(input_)
@staticmethod
def backward(ctx, grad_output):
return _split(grad_output)
return _split_along_last_dim(grad_output)
class _ScatterToSequenceParallelRegion(torch.autograd.Function):
"""Split the input and keep only the corresponding chuck to the rank."""
@staticmethod
def symbolic(graph, input_):
return _split_along_first_dim(input_)
@staticmethod
def forward(ctx, input_):
return _split_along_first_dim(input_)
@staticmethod
def backward(ctx, grad_output):
return _gather_along_first_dim(grad_output)
class _GatherFromSequenceParallelRegion(torch.autograd.Function):
"""Gather the input from sequence parallel region and concatinate."""
@staticmethod
def symbolic(graph, input_, tensor_parallel_output_grad=True):
return _gather_along_first_dim(input_)
@staticmethod
def forward(ctx, input_, tensor_parallel_output_grad=True):
ctx.tensor_parallel_output_grad = tensor_parallel_output_grad
return _gather_along_first_dim(input_)
@staticmethod
def backward(ctx, grad_output):
tensor_parallel_output_grad = ctx.tensor_parallel_output_grad
# If the computation graph after the gather operation is
# in the tensor parallel mode, output gradients need to be
# reduce-scattered, whereas if the computation is duplicated,
# output gradients only need to be scattered.
if tensor_parallel_output_grad:
return _reduce_scatter_along_first_dim(grad_output), None
else:
return _split_along_first_dim(grad_output), None
class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
"""Reduce scatter the input from the model parallel region."""
@staticmethod
def symbolic(graph, input_):
return _reduce_scatter_along_first_dim(input_)
@staticmethod
def forward(ctx, input_):
return _reduce_scatter_along_first_dim(input_)
@staticmethod
def backward(ctx, grad_output):
return _gather_along_first_dim(grad_output)
# -----------------
......@@ -155,3 +273,16 @@ def scatter_to_tensor_model_parallel_region(input_):
def gather_from_tensor_model_parallel_region(input_):
return _GatherFromModelParallelRegion.apply(input_)
def scatter_to_sequence_parallel_region(input_):
return _ScatterToSequenceParallelRegion.apply(input_)
def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)
def reduce_scatter_to_sequence_parallel_region(input_):
return _ReduceScatterToSequenceParallelRegion.apply(input_)
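A hedged usage sketch of how the wrappers above compose along the sequence dimension of an [s, b, h] activation; it assumes torch.distributed and Megatron's model-parallel state are already initialized, so it is not runnable standalone.

from megatron import mpu

def sequence_parallel_round_trip(full_activation):
    # Keep only this rank's s/world_size slice; the backward pass all-gathers the gradient.
    local_chunk = mpu.scatter_to_sequence_parallel_region(full_activation)
    # Rebuild the full sequence. The Pooler passes tensor_parallel_output_grad=False because
    # its consumer is replicated across tensor-parallel ranks, so the backward pass splits
    # the gradient instead of reduce-scattering it.
    return mpu.gather_from_sequence_parallel_region(
        local_chunk, tensor_parallel_output_grad=False)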
......@@ -307,10 +307,10 @@ class CheckpointFunction(torch.autograd.Function):
tracked/set/reset.
"""
@staticmethod
def forward(ctx, run_function, distribute_checkpointed_activations, *args):
def forward(ctx, run_function, distribute_saved_activations, *args):
ctx.run_function = run_function
ctx.distribute_checkpointed_activations \
= distribute_checkpointed_activations
ctx.distribute_saved_activations \
= distribute_saved_activations
# Copy the rng states.
ctx.fwd_cpu_rng_state = torch.get_rng_state()
......@@ -322,7 +322,7 @@ class CheckpointFunction(torch.autograd.Function):
# Divide hidden states across model parallel group and only keep
# the chunk corresponding to the current rank.
if distribute_checkpointed_activations:
if distribute_saved_activations:
ctx.input_0_shape = args[0].data.shape
safely_set_viewless_tensor_data(
args[0],
......@@ -339,7 +339,7 @@ class CheckpointFunction(torch.autograd.Function):
raise RuntimeError("Checkpointing is not compatible with .grad(), "
"please use .backward() if possible")
inputs = ctx.saved_tensors
if ctx.distribute_checkpointed_activations:
if ctx.distribute_saved_activations:
safely_set_viewless_tensor_data(
inputs[0],
gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
......@@ -372,8 +372,8 @@ class CheckpointFunction(torch.autograd.Function):
return (None, None) + grads
def checkpoint(function, distribute_checkpointed_activations, *args):
def checkpoint(function, distribute_saved_activations, *args):
"""Checkpoint a model or part of the model.
This has been directly copied from torch.utils.checkpoint."""
return CheckpointFunction.apply(function,
distribute_checkpointed_activations, *args)
distribute_saved_activations, *args)
......@@ -17,8 +17,8 @@ from apex.optimizers import FusedAdam as Adam
from apex.optimizers import FusedSGD as SGD
from megatron import get_args
from megatron.model import LayerNorm
from .distrib_optimizer import DistributedOptimizer
from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
......@@ -105,7 +105,11 @@ def get_megatron_optimizer(model,
if args.DDP_impl == 'local':
params_have_main_grad = True
if args.fp16 or args.bf16:
# Mixed precision optimizer.
# - Note: both the Float16Optimizer and the DistributedOptimizer inherit
# from the MixedPrecisionOptimizer, which manages any optimizer where
# the model params and main params are distinct.
if args.fp16 or args.bf16 or args.use_distributed_optimizer:
# Grad scaler:
# if loss-scale is provided, instantiate the constant scaler.
......@@ -114,9 +118,11 @@ def get_megatron_optimizer(model,
# otherwise we are running in bf16 with no loss-scale so
# leave it as None.
grad_scaler = None
# Constant loss scale.
if args.loss_scale:
grad_scaler = ConstantGradScaler(args.loss_scale)
# Dynamic loss scale.
else:
if args.fp16:
......@@ -129,16 +135,22 @@ def get_megatron_optimizer(model,
hysteresis=args.hysteresis)
# Megatron optimizer.
return Float16OptimizerWithFloat16Params(optimizer,
args.clip_grad,
args.log_num_zeros_in_grad,
params_have_main_grad,
args.use_contiguous_buffers_in_local_ddp,
args.bf16,
grad_scaler)
opt_ty = DistributedOptimizer \
if args.use_distributed_optimizer else \
Float16OptimizerWithFloat16Params
return opt_ty(optimizer,
args.clip_grad,
args.log_num_zeros_in_grad,
params_have_main_grad,
args.use_contiguous_buffers_in_local_ddp,
args.fp16,
args.bf16,
grad_scaler,
model)
# FP32.
return FP32Optimizer(optimizer, args.clip_grad,
args.log_num_zeros_in_grad,
params_have_main_grad,
args.use_contiguous_buffers_in_local_ddp)
args.use_contiguous_buffers_in_local_ddp,
model)
......@@ -21,12 +21,13 @@ from torch._six import inf
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import mpu
from megatron.model.module import param_is_not_shared
from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
def clip_grad_norm_fp32(parameters, grads_for_norm,
max_norm, norm_type=2,
model_parallel_group=None):
"""Clips gradient norm of an iterable of parameters whose gradients
are in fp32.
......@@ -37,9 +38,13 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
Arguments:
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
single Tensor that will have gradients normalized
grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single
Tensor that will be used for calculating the grad norm.
max_norm (float or int): max norm of the gradients
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
model_parallel_group (group): given the nature of the distributed
optimizer, this is passed as an argument.
Returns:
Total norm of the parameters (viewed as a single vector).
......@@ -47,25 +52,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
if isinstance(grads_for_norm, torch.Tensor):
grads_for_norm = [grads_for_norm]
# Filter parameters based on:
# - grad should not be none
# - parameter should not be shared
# - should not be a replica due to tensor model parallelism
# Grads.
grads = []
grads_for_norm = []
for param in parameters:
grad_not_none = param.grad is not None
is_not_shared = param_is_not_shared(param)
is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
if grad_not_none:
grad = param.grad.detach()
if grad_not_none:
# Make sure the grads are in fp32
if param.grad is not None:
assert param.grad.type() == 'torch.cuda.FloatTensor'
grads.append(grad)
if grad_not_none and is_not_shared and is_not_tp_duplicate:
grads_for_norm.append(grad)
grads.append(param.grad.detach())
# Norm parameters.
max_norm = float(max_norm)
......@@ -79,7 +74,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# Take max across all model-parallel GPUs.
torch.distributed.all_reduce(total_norm_cuda,
op=torch.distributed.ReduceOp.MAX,
group=mpu.get_model_parallel_group())
group=model_parallel_group)
total_norm = total_norm_cuda[0].item()
else:
......@@ -88,12 +83,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# Use apex's multi-tensor applier for efficiency reasons.
# Multi-tensor applier takes a function and a list of list
# and performs the operation on that list all in one kernel.
grad_norm, _ = multi_tensor_applier(
amp_C.multi_tensor_l2norm,
dummy_overflow_buf,
[grads_for_norm],
False # no per-parameter norm
)
if grads_for_norm:
grad_norm, _ = multi_tensor_applier(
amp_C.multi_tensor_l2norm,
dummy_overflow_buf,
[grads_for_norm],
False # no per-parameter norm
)
else:
grad_norm = torch.cuda.FloatTensor([0])
# Since we will be summing across data parallel groups,
# we need the pow(norm-type).
total_norm = grad_norm ** norm_type
......@@ -106,7 +104,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# Sum across all model-parallel GPUs.
torch.distributed.all_reduce(total_norm,
op=torch.distributed.ReduceOp.SUM,
group=mpu.get_model_parallel_group())
group=model_parallel_group)
total_norm = total_norm.item() ** (1.0 / norm_type)
# Scale.
......@@ -121,7 +119,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
return total_norm
def count_zeros_fp32(parameters):
def count_zeros_fp32(parameters, model_parallel_group):
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
......@@ -130,7 +128,7 @@ def count_zeros_fp32(parameters):
# - grad should not be none
# - parameter should not be shared
# - should not be a replica due to tensor model parallelism
total_num_zeros = 0.0
total_num_zeros = torch.cuda.FloatTensor([0.0])
for param in parameters:
grad_not_none = param.grad is not None
is_not_shared = param_is_not_shared(param)
......@@ -143,7 +141,8 @@ def count_zeros_fp32(parameters):
# Sum across all model-parallel GPUs.
torch.distributed.all_reduce(total_num_zeros,
op=torch.distributed.ReduceOp.SUM,
group=mpu.get_model_parallel_group())
group=model_parallel_group)
total_num_zeros = total_num_zeros.item()
return total_num_zeros
......@@ -61,7 +61,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
override_scatter_gather_tensors_in_pipeline = False
if args.scatter_gather_tensors_in_pipeline:
if args.scatter_gather_tensors_in_pipeline and \
not args.sequence_parallel:
tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
if tensor_chunk_shape % mpu.get_tensor_model_parallel_world_size() == 0:
tensor_chunk_shape = tensor_chunk_shape // \
......@@ -93,7 +94,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
# Split tensor into smaller chunks if using scatter-gather optimization.
if not override_scatter_gather_tensors_in_pipeline and \
args.scatter_gather_tensors_in_pipeline:
args.scatter_gather_tensors_in_pipeline and \
not args.sequence_parallel:
if tensor_send_next is not None:
tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(tensor_send_next)
......@@ -138,7 +140,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
# If using scatter-gather optimization, gather smaller chunks.
if not override_scatter_gather_tensors_in_pipeline and \
args.scatter_gather_tensors_in_pipeline:
args.scatter_gather_tensors_in_pipeline and \
not args.sequence_parallel:
if recv_prev:
tensor_recv_prev = mpu.gather_split_1d_tensor(
tensor_recv_prev).view(tensor_shape).requires_grad_()
......
......@@ -279,8 +279,12 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()
args = get_args()
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
if args.sequence_parallel:
seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
else:
seq_length = args.seq_length
tensor_shape = (seq_length, args.micro_batch_size, args.hidden_size)
# Compute number of warmup and remaining microbatches.
num_model_chunks = len(model)
num_microbatches = get_num_microbatches() * num_model_chunks
......@@ -514,18 +518,25 @@ def get_tensor_shapes(rank, model_type):
# Otherwise, send one tensor (pre-transpose).
args = get_args()
tensor_shapes = []
if args.sequence_parallel:
seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
else:
seq_length = args.seq_length
if model_type == ModelType.encoder_and_decoder:
if args.sequence_parallel:
decoder_seq_length = args.decoder_seq_length // mpu.get_tensor_model_parallel_world_size()
else:
decoder_seq_length = args.decoder_seq_length
if mpu.is_pipeline_stage_before_split(rank):
# If next rank is after split, then need transpose for encoder_hidden_state.
if mpu.is_pipeline_stage_before_split(rank+1):
tensor_shapes.append((args.seq_length, args.micro_batch_size, args.hidden_size))
else:
tensor_shapes.append((args.micro_batch_size, args.seq_length, args.hidden_size))
tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
else:
tensor_shapes.append((args.decoder_seq_length, args.micro_batch_size, args.hidden_size))
tensor_shapes.append((args.micro_batch_size, args.seq_length, args.hidden_size))
tensor_shapes.append((decoder_seq_length, args.micro_batch_size, args.hidden_size))
tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
else:
tensor_shapes.append((args.seq_length, args.micro_batch_size, args.hidden_size))
tensor_shapes.append((seq_length, args.micro_batch_size, args.hidden_size))
return tensor_shapes
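A quick pure-Python illustration of the shape bookkeeping above with hypothetical sizes: under sequence parallelism each pipeline send/recv buffer carries seq_length / tensor_model_parallel_world_size rows.

seq_length, micro_batch_size, hidden_size = 2048, 4, 4096   # hypothetical sizes
tensor_model_parallel_world_size = 8
sequence_parallel = True

local_seq = (seq_length // tensor_model_parallel_world_size
             if sequence_parallel else seq_length)
tensor_shape = (local_seq, micro_batch_size, hidden_size)
print(tensor_shape)                                 # (256, 4, 4096)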
......
......@@ -16,4 +16,5 @@
from .api import (
generate,
generate_and_post_process)
generate_and_post_process,
beam_search_and_post_process)
......@@ -22,7 +22,8 @@ from megatron import mpu
from .communication import broadcast_float_list
from .generation import (
generate_tokens_probs_and_return_on_first_stage,
score_and_return_on_first_stage)
score_and_return_on_first_stage,
beam_search_and_return_on_first_stage)
from .tokenization import (
tokenize_prompts,
detokenize_generations)
......@@ -138,3 +139,54 @@ def generate(model,
use_eod_token_for_early_termination=use_eod_token_for_early_termination,
stop_on_double_eol=stop_on_double_eol,
stop_on_eol=stop_on_eol)
def beam_search_and_post_process(model,
prompts=None,
tokens_to_generate=0,
beam_size=0,
add_BOS=False,
stop_token=50256,
num_return_gen=1,
length_penalty=1):
"""Run beam search and post-process outputs, i.e., detokenize,
move to cpu and convert to list."""
# Main inference.
tokens, scores = beam_search(model,
prompts=prompts,
tokens_to_generate=tokens_to_generate,
beam_size=beam_size,
add_BOS=add_BOS,
stop_token=stop_token,
num_return_gen=num_return_gen,
length_penalty=length_penalty)
# Only post-process on first stage.
if mpu.is_pipeline_first_stage():
lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device())
tokens, prompts_plus_generations, prompts_plus_generations_segments = detokenize_generations(tokens, lengths, True)
scores = scores.cpu().numpy().tolist()
return prompts_plus_generations, prompts_plus_generations_segments, scores
return None
def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1):
# Make sure input params are available to all ranks.
values = [tokens_to_generate,
beam_size,
add_BOS,
stop_token,
num_return_gen,
length_penalty]
values_float_tensor = broadcast_float_list(6, float_list=values)
tokens_to_generate = int(values_float_tensor[0].item())
beam_size = int(values_float_tensor[1].item())
add_BOS = bool(values_float_tensor[2].item())
stop_token = int(values_float_tensor[3].item())
num_return_gen = int(values_float_tensor[4].item())
length_penalty = values_float_tensor[5].item()
context_tokens_tensor, context_length_tensor = tokenize_prompts(
prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)
return beam_search_and_return_on_first_stage(model, context_tokens_tensor, context_length_tensor,
beam_size, stop_token=stop_token, num_return_gen=num_return_gen, length_penalty=length_penalty)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## Adapted from the HuggingFace beam search implementation
class BeamHypotheses(object):
def __init__(self, num_beams, length_penalty=1.0, early_stopping=False):
"""
Initialize n-best list of hypotheses.
"""
self.length_penalty = length_penalty
self.early_stopping = early_stopping
self.num_beams = num_beams
self.beams = []
self.worst_score = 1e9
def __len__(self):
"""
Number of hypotheses in the list.
"""
return len(self.beams)
def add(self, hyp, sum_logprobs, length):
"""
Add a new hypothesis to the list.
"""
score = sum_logprobs / length ** self.length_penalty
if len(self) < self.num_beams or score > self.worst_score:
self.beams.append((score, hyp))
if len(self) > self.num_beams:
sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
del self.beams[sorted_scores[0][1]]
self.worst_score = sorted_scores[1][0]
else:
self.worst_score = min(score, self.worst_score)
def is_done(self, best_sum_logprobs, cur_len):
"""
If there are enough hypotheses and none of the hypotheses being generated
can become better than the worst one in the heap, then we are done with this sentence.
"""
if len(self) < self.num_beams:
return False
elif self.early_stopping:
return True
else:
cur_score = best_sum_logprobs / cur_len ** self.length_penalty
ret = self.worst_score >= cur_score
return ret
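A small usage sketch of the BeamHypotheses helper above with illustrative numbers (it assumes the class as defined here): hypotheses are ranked by sum_logprobs / length ** length_penalty, the worst kept entry is evicted once the list exceeds num_beams, and is_done compares the best achievable score against the worst kept one.

hyps = BeamHypotheses(num_beams=2, length_penalty=1.0)
hyps.add([7, 8, 9], sum_logprobs=-2.0, length=3)            # score -0.667, kept
hyps.add([7, 8], sum_logprobs=-1.0, length=2)               # score -0.5, kept
hyps.add([7, 8, 9, 10], sum_logprobs=-4.0, length=4)        # score -1.0, rejected (worse than worst kept)
print(len(hyps), round(hyps.worst_score, 3))                # 2 -0.667
print(hyps.is_done(best_sum_logprobs=-3.0, cur_len=4))      # True: -0.75 cannot beat -0.667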
......@@ -42,7 +42,18 @@ class InferenceParams:
self.batch_size_offset = 0
self.key_value_memory_dict = {}
def swap_key_value_dict(self, batch_idx):
"swap between batches"
if len(self.key_value_memory_dict) == 0:
raise ValueError("should not swap when dict in empty")
for layer_number in self.key_value_memory_dict.keys():
inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number]
assert len(batch_idx) == inference_key_memory.shape[1] ## make sure batch size is the same
new_inference_key_memory = inference_key_memory[:, batch_idx]
new_inference_value_memory = inference_value_memory[:, batch_idx]
self.key_value_memory_dict[layer_number] = (
new_inference_key_memory, new_inference_value_memory)
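A single-process sketch of the cache reshuffle above: the KV cache is laid out [sequence, batch, ...], so indexing dim 1 with a permutation of batch indices reorders the cached keys and values to follow the surviving beams. Shapes and the trailing head dimensions here are illustrative.

import torch

seq, batch, heads, head_dim = 5, 4, 2, 8            # illustrative cache shape
inference_key_memory = torch.randn(seq, batch, heads, head_dim)
batch_idx = torch.tensor([2, 0, 1, 3])              # new order of the surviving beams

new_inference_key_memory = inference_key_memory[:, batch_idx]
print(new_inference_key_memory.shape)               # torch.Size([5, 4, 2, 8])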
class ForwardStep:
"""Forward step function with all the communications.
......