Commit 6fd0b406 authored by zihanl

merge with main branch

parents 492fdf83 60750922
......@@ -58,7 +58,8 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
grad_not_none = param.grad is not None
is_not_shared = param_is_not_shared(param)
is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
grad = param.grad.detach()
if grad_not_none:
grad = param.grad.detach()
if grad_not_none:
# Make sure the grads are in fp32
assert param.grad.type() == 'torch.cuda.FloatTensor'
......
......@@ -68,7 +68,9 @@ class MegatronOptimizer(ABC):
def __init__(self, optimizer, clip_grad,
log_num_zeros_in_grad,
params_have_main_grad):
params_have_main_grad,
use_contiguous_buffers_in_local_ddp):
"""Input optimizer is the base optimizer for example Adam."""
self.optimizer = optimizer
assert self.optimizer, 'no optimizer is provided.'
......@@ -76,7 +78,11 @@ class MegatronOptimizer(ABC):
self.clip_grad = clip_grad
self.log_num_zeros_in_grad = log_num_zeros_in_grad
self.params_have_main_grad = params_have_main_grad
self.use_contiguous_buffers_in_local_ddp = use_contiguous_buffers_in_local_ddp
if self.use_contiguous_buffers_in_local_ddp:
assert self.params_have_main_grad, \
"use of contiguous buffer requires that params have main grad"
def get_parameters(self):
params = []
......@@ -173,7 +179,7 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
a `main_grad` field. If this is set, we are assuming
that the model parameters are stored in the `main_grad`
field instead of the typical `grad` field. This happens
for the DDP cases where there is a contihuous buffer
for the DDP cases where there is a continuous buffer
holding the gradients. For example for bfloat16, we want
to do gradient accumulation and all-reduces in float32
and as a result we store those gradients in the main_grad.
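# Illustrative sketch (not part of this diff) of the main_grad idea described
# above: bf16 gradients are accumulated into a persistent fp32 buffer so that
# gradient accumulation and all-reduce happen in full precision. The shapes
# and loop count below are made-up values.
import torch
param = torch.nn.Parameter(torch.randn(4, 4, dtype=torch.bfloat16))
param.main_grad = torch.zeros(4, 4, dtype=torch.float32)  # hypothetical fp32 buffer
for _ in range(2):  # two micro-batches of gradient accumulation
    loss = (param * param).sum()
    loss.backward()
    param.main_grad.add_(param.grad.float())  # accumulate in fp32
    param.grad = None                         # bf16 grad no longer needed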
......@@ -187,11 +193,12 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
"""
def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad, bf16, grad_scaler):
params_have_main_grad, use_contiguous_buffers_in_local_ddp,
bf16, grad_scaler):
super(Float16OptimizerWithFloat16Params, self).__init__(
optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad)
params_have_main_grad, use_contiguous_buffers_in_local_ddp)
self.bf16 = bf16
self.grad_scaler = grad_scaler
......@@ -282,9 +289,14 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
def zero_grad(self, set_to_none=True):
"""We only need to zero the model related parameters, i.e.,
float16_groups & fp32_from_fp32_groups."""
float16_groups & fp32_from_fp32_groups. We additionally zero
fp32_from_float16_groups as a memory optimization to reduce
fragmentation; in the case of set_to_none==True, the space
used by this field can be safely deallocated at this point."""
for group in self.float16_groups:
_zero_grad_group_helper(group, set_to_none)
for group in self.fp32_from_float16_groups:
_zero_grad_group_helper(group, set_to_none)
for group in self.fp32_from_fp32_groups:
_zero_grad_group_helper(group, set_to_none)
......@@ -300,17 +312,31 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
for model_group, main_group in zip(self.float16_groups,
self.fp32_from_float16_groups):
for model_param, main_param in zip(model_group, main_group):
if self.params_have_main_grad:
if self.params_have_main_grad and hasattr(model_param, 'main_grad'):
main_param.grad = model_param.main_grad.float()
else:
if model_param.grad is not None:
main_param.grad = model_param.grad.float()
# Safe to deallocate model's grad/main_grad after copying.
# (If using contiguous buffers, main_grad's memory should
# persist and therefore should not be deallocated.)
model_param.grad = None
if self.params_have_main_grad and \
not self.use_contiguous_buffers_in_local_ddp:
model_param.main_grad = None
# For fp32 grads, we need to reset the grads to main grad.
if self.params_have_main_grad:
for model_group in self.fp32_from_fp32_groups:
for model_param in model_group:
model_param.grad = model_param.main_grad
# Safe to de-reference model's main_grad after copying.
# (If using contiguous buffers, main_grad's memory should
# persist and therefore should not be deallocated.)
if not self.use_contiguous_buffers_in_local_ddp:
model_param.main_grad = None
def _unscale_main_grads_and_check_for_nan(self):
main_grads = []
......@@ -464,11 +490,12 @@ class FP32Optimizer(MegatronOptimizer):
def __init__(self, optimizer, clip_grad,
log_num_zeros_in_grad,
params_have_main_grad):
params_have_main_grad,
use_contiguous_buffers_in_local_ddp):
super(FP32Optimizer, self).__init__(
optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad)
params_have_main_grad, use_contiguous_buffers_in_local_ddp)
self._scale = torch.cuda.FloatTensor([1.0])
......@@ -495,6 +522,12 @@ class FP32Optimizer(MegatronOptimizer):
for param in param_group['params']:
param.grad = param.main_grad
# Safe to de-reference model's main_grad after copying.
# (If using contiguous buffers, main_grad's memory should
# persist and therefore should not be deallocated.)
if not self.use_contiguous_buffers_in_local_ddp:
param.main_grad = None
# Clip gradients.
grad_norm = None
if self.clip_grad > 0.0:
......
......@@ -22,7 +22,9 @@ from megatron import mpu
def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
use_ring_exchange=False):
tensor_shape,
use_ring_exchange=False,
dtype_=None):
"""Communicate tensors between stages. Used as helper method in other
communication methods that are used in megatron/schedules.py.
......@@ -35,9 +37,13 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
previous rank.
recv_next: boolean for whether tensor should be received from
next rank.
tensor_shape: shape of tensor to receive (this method assumes that all
tensors sent and received in a single function call are
the same shape).
use_ring_exchange: boolean for whether torch.distributed.ring_exchange()
API should be used.
dtype_: optional, used when the dtype of the tensor that needs to be
communicated is different from args.params_dtype.
Returns:
(tensor_recv_prev, tensor_recv_next)
"""
......@@ -47,28 +53,47 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
# if needed.
tensor_recv_prev = None
tensor_recv_next = None
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
# Some legacy inference code doesn't set the tensor shape; set it here to
# the default values for GPT/BERT. This could be removed once the inference
# code is changed to provide tensor_shape.
if tensor_shape is None:
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
override_scatter_gather_tensors_in_pipeline = False
if args.scatter_gather_tensors_in_pipeline:
tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1) // \
mpu.get_tensor_model_parallel_world_size()
tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
if tensor_chunk_shape % mpu.get_tensor_model_parallel_world_size() == 0:
tensor_chunk_shape = tensor_chunk_shape // \
mpu.get_tensor_model_parallel_world_size()
else:
tensor_chunk_shape = tensor_shape
override_scatter_gather_tensors_in_pipeline = True
else:
tensor_chunk_shape = tensor_shape
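# Illustrative example of the chunking rule above (values assumed, not from
# this diff): a (512, 4, 1024) activation with a tensor-model-parallel world
# size of 8 flattens to 512 * 4 * 1024 = 2,097,152 elements, which is evenly
# divisible by 8, so each rank communicates a chunk of 262,144 elements. If
# the element count were not divisible, the full (unsplit) tensor would be
# communicated and the scatter-gather optimization disabled.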
dtype = args.params_dtype
if args.fp32_residual_connection:
dtype = torch.float
requires_grad = True
if dtype_ is not None:
dtype = dtype_
requires_grad = False
if recv_prev:
tensor_recv_prev = torch.empty(tensor_chunk_shape,
requires_grad=True,
requires_grad=requires_grad,
device=torch.cuda.current_device(),
dtype=dtype)
if recv_next:
tensor_recv_next = torch.empty(tensor_chunk_shape,
requires_grad=True,
requires_grad=requires_grad,
device=torch.cuda.current_device(),
dtype=dtype)
# Split tensor into smaller chunks if using scatter-gather optimization.
if args.scatter_gather_tensors_in_pipeline:
if not override_scatter_gather_tensors_in_pipeline and \
args.scatter_gather_tensors_in_pipeline:
if tensor_send_next is not None:
tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(tensor_send_next)
......@@ -112,7 +137,8 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
torch.cuda.synchronize()
# If using scatter-gather optimization, gather smaller chunks.
if args.scatter_gather_tensors_in_pipeline:
if not override_scatter_gather_tensors_in_pipeline and \
args.scatter_gather_tensors_in_pipeline:
if recv_prev:
tensor_recv_prev = mpu.gather_split_1d_tensor(
tensor_recv_prev).view(tensor_shape).requires_grad_()
......@@ -124,8 +150,9 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
return tensor_recv_prev, tensor_recv_next
def recv_forward(timers=None):
def recv_forward(tensor_shape=None, dtype_=None, timers=None):
"""Receive tensor from previous rank in pipeline (forward receive)."""
if mpu.is_pipeline_first_stage():
input_tensor = None
else:
......@@ -135,13 +162,15 @@ def recv_forward(timers=None):
tensor_send_next=None,
tensor_send_prev=None,
recv_prev=True,
recv_next=False)
recv_next=False,
tensor_shape=tensor_shape,
dtype_=dtype_)
if timers is not None:
timers('forward-recv').stop()
return input_tensor
def recv_backward(timers=None):
def recv_backward(tensor_shape=None, timers=None):
"""Receive tensor from next rank in pipeline (backward receive)."""
if mpu.is_pipeline_last_stage():
output_tensor_grad = None
......@@ -152,14 +181,16 @@ def recv_backward(timers=None):
tensor_send_next=None,
tensor_send_prev=None,
recv_prev=False,
recv_next=True)
recv_next=True,
tensor_shape=tensor_shape)
if timers is not None:
timers('backward-recv').stop()
return output_tensor_grad
def send_forward(output_tensor, timers=None):
def send_forward(output_tensor, tensor_shape=None, dtype_=None, timers=None):
"""Send tensor to next rank in pipeline (forward send)."""
if not mpu.is_pipeline_last_stage():
if timers is not None:
timers('forward-send').start()
......@@ -167,12 +198,14 @@ def send_forward(output_tensor, timers=None):
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=False,
recv_next=False)
recv_next=False,
tensor_shape=tensor_shape,
dtype_=dtype_)
if timers is not None:
timers('forward-send').stop()
def send_backward(input_tensor_grad, timers=None):
def send_backward(input_tensor_grad, tensor_shape=None, timers=None):
"""Send tensor to previous rank in pipeline (backward send)."""
if not mpu.is_pipeline_first_stage():
if timers is not None:
......@@ -181,12 +214,13 @@ def send_backward(input_tensor_grad, timers=None):
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=False,
recv_next=False)
recv_next=False,
tensor_shape=tensor_shape)
if timers is not None:
timers('backward-send').stop()
def send_forward_recv_backward(output_tensor, timers=None):
def send_forward_recv_backward(output_tensor, tensor_shape=None, timers=None):
"""Batched send and recv with next rank in pipeline."""
if mpu.is_pipeline_last_stage():
output_tensor_grad = None
......@@ -197,13 +231,14 @@ def send_forward_recv_backward(output_tensor, timers=None):
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=False,
recv_next=True)
recv_next=True,
tensor_shape=tensor_shape)
if timers is not None:
timers('forward-send-backward-recv').stop()
return output_tensor_grad
def send_backward_recv_forward(input_tensor_grad, timers=None):
def send_backward_recv_forward(input_tensor_grad, tensor_shape=None, timers=None):
"""Batched send and recv with previous rank in pipeline."""
if mpu.is_pipeline_first_stage():
input_tensor = None
......@@ -214,13 +249,14 @@ def send_backward_recv_forward(input_tensor_grad, timers=None):
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=True,
recv_next=False)
recv_next=False,
tensor_shape=tensor_shape)
if timers is not None:
timers('backward-send-forward-recv').stop()
return input_tensor
def send_forward_recv_forward(output_tensor, recv_prev, timers=None):
def send_forward_recv_forward(output_tensor, recv_prev, tensor_shape=None, timers=None):
"""Batched recv from previous rank and send to next rank in pipeline."""
if timers is not None:
timers('forward-send-forward-recv').start()
......@@ -228,13 +264,14 @@ def send_forward_recv_forward(output_tensor, recv_prev, timers=None):
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=recv_prev,
recv_next=False)
recv_next=False,
tensor_shape=tensor_shape)
if timers is not None:
timers('forward-send-forward-recv').stop()
return input_tensor
def send_backward_recv_backward(input_tensor_grad, recv_next, timers=None):
def send_backward_recv_backward(input_tensor_grad, recv_next, tensor_shape=None, timers=None):
"""Batched recv from next rank and send to previous rank in pipeline."""
if timers is not None:
timers('backward-send-backward-recv').start()
......@@ -242,7 +279,8 @@ def send_backward_recv_backward(input_tensor_grad, recv_next, timers=None):
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=False,
recv_next=recv_next)
recv_next=recv_next,
tensor_shape=tensor_shape)
if timers is not None:
timers('backward-send-backward-recv').stop()
return output_tensor_grad
......@@ -250,7 +288,7 @@ def send_backward_recv_backward(input_tensor_grad, recv_next, timers=None):
def send_forward_backward_recv_forward_backward(
output_tensor, input_tensor_grad, recv_prev,
recv_next, timers=None):
recv_next, tensor_shape=None, timers=None):
"""Batched send and recv with previous and next ranks in pipeline."""
if timers is not None:
timers('forward-backward-send-forward-backward-recv').start()
......@@ -258,7 +296,8 @@ def send_forward_backward_recv_forward_backward(
tensor_send_next=output_tensor,
tensor_send_prev=input_tensor_grad,
recv_prev=recv_prev,
recv_next=recv_next)
recv_next=recv_next,
tensor_shape=tensor_shape)
if timers is not None:
timers('forward-backward-send-forward-backward-recv').stop()
return input_tensor, output_tensor_grad
......@@ -25,12 +25,17 @@ from megatron import p2p_communication
from megatron.utils import unwrap_model
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import Float16Module
from megatron.model import ModelType
def get_forward_backward_func():
args = get_args()
if mpu.get_pipeline_model_parallel_world_size() > 1:
if args.virtual_pipeline_model_parallel_size is not None:
forward_backward_func = forward_backward_pipelining_with_interleaving
assert get_num_microbatches() % args.pipeline_model_parallel_size == 0, \
'number of microbatches is not divisible by pipeline-parallel ' \
'size when using interleaved schedule'
else:
forward_backward_func = forward_backward_pipelining_without_interleaving
else:
......@@ -45,11 +50,18 @@ def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_r
passed-in input_tensor is used.
Returns output tensor."""
args = get_args()
timers = get_timers()
timers('forward-compute').start()
unwrapped_model = unwrap_model(
model, (torchDDP, LocalDDP, Float16Module))
unwrap_output_tensor = False
if not isinstance(input_tensor, list):
input_tensor = [input_tensor]
unwrap_output_tensor = True
unwrapped_model.set_input_tensor(input_tensor)
output_tensor, loss_func = forward_step_func(data_iterator, model)
if mpu.is_pipeline_last_stage():
......@@ -59,7 +71,15 @@ def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_r
losses_reduced.append(loss_reduced)
timers('forward-compute').stop()
return output_tensor
# If T5 model (or other model with encoder and decoder)
# and in decoder stack, then send encoder_hidden_state
# downstream as well.
if mpu.is_pipeline_stage_after_split() and \
args.model_type == ModelType.encoder_and_decoder:
return [output_tensor, input_tensor[-1]]
if unwrap_output_tensor:
return output_tensor
return [output_tensor]
def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
......@@ -70,24 +90,53 @@ def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
Returns gradient of loss with respect to input tensor (None if first
stage)."""
# NOTE: This code currently can handle at most one skip connection. It
# needs to be modified slightly to support arbitrary numbers of skip
# connections.
args = get_args()
timers = get_timers()
timers('backward-compute').start()
# Retain the grad on the input_tensor.
if input_tensor is not None:
input_tensor.retain_grad()
unwrap_input_tensor_grad = False
if not isinstance(input_tensor, list):
input_tensor = [input_tensor]
unwrap_input_tensor_grad = True
for x in input_tensor:
if x is not None:
x.retain_grad()
if not isinstance(output_tensor, list):
output_tensor = [output_tensor]
if not isinstance(output_tensor_grad, list):
output_tensor_grad = [output_tensor_grad]
# Backward pass.
if output_tensor_grad is None:
output_tensor = optimizer.scale_loss(output_tensor)
torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad)
if output_tensor_grad[0] is None:
output_tensor = optimizer.scale_loss(output_tensor[0])
torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0])
# Collect the grad of the input_tensor.
input_tensor_grad = None
input_tensor_grad = [None]
if input_tensor is not None:
input_tensor_grad = input_tensor.grad
input_tensor_grad = []
for x in input_tensor:
if x is None:
input_tensor_grad.append(None)
else:
input_tensor_grad.append(x.grad)
# Handle single skip connection if it exists (encoder_hidden_state in
# model with encoder and decoder).
if mpu.get_pipeline_model_parallel_world_size() > 1 and \
mpu.is_pipeline_stage_after_split() and \
args.model_type == ModelType.encoder_and_decoder:
if output_tensor_grad[1] is not None:
input_tensor_grad[-1].add_(output_tensor_grad[1])
if unwrap_input_tensor_grad:
input_tensor_grad = input_tensor_grad[0]
timers('backward-compute').stop()
......@@ -150,6 +199,9 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
pipeline_parallel_size = mpu.get_pipeline_model_parallel_world_size()
pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()
args = get_args()
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
# Compute number of warmup and remaining microbatches.
num_model_chunks = len(model)
num_microbatches = get_num_microbatches() * num_model_chunks
......@@ -191,6 +243,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
model_chunk_id = get_model_chunk_id(microbatch_id, forward=True)
mpu.set_virtual_pipeline_model_parallel_rank(model_chunk_id)
# forward step
if mpu.is_pipeline_first_stage():
if len(input_tensors[model_chunk_id]) == \
len(output_tensors[model_chunk_id]):
......@@ -202,6 +255,11 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
input_tensor, losses_reduced)
output_tensors[model_chunk_id].append(output_tensor)
# if forward-only, no need to save tensors for a backward pass
if forward_only:
input_tensors[model_chunk_id].pop()
output_tensors[model_chunk_id].pop()
return output_tensor
def backward_step_helper(microbatch_id):
......@@ -228,7 +286,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
# Run warmup forward passes.
mpu.set_virtual_pipeline_model_parallel_rank(0)
input_tensors[0].append(
p2p_communication.recv_forward(timers))
p2p_communication.recv_forward(tensor_shape, timers=timers))
for k in range(num_warmup_microbatches):
output_tensor = forward_step_helper(k)
......@@ -257,12 +315,15 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
p2p_communication.send_forward_backward_recv_forward_backward(
output_tensor, input_tensor_grad,
recv_prev=recv_prev, recv_next=recv_next,
tensor_shape=tensor_shape,
timers=timers)
output_tensor_grads[num_model_chunks-1].append(output_tensor_grad)
else:
input_tensor = \
p2p_communication.send_forward_recv_forward(
output_tensor, recv_prev, timers)
output_tensor, recv_prev=recv_prev,
tensor_shape=tensor_shape,
timers=timers)
input_tensors[next_forward_model_chunk_id].append(input_tensor)
# Run 1F1B in steady state.
......@@ -326,7 +387,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
p2p_communication.send_forward_backward_recv_forward_backward(
output_tensor, input_tensor_grad,
recv_prev=recv_prev, recv_next=recv_next,
timers=timers)
tensor_shape=tensor_shape, timers=timers)
# Put input_tensor and output_tensor_grad in data structures in the
# right location.
......@@ -340,7 +401,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
if not forward_only:
if all_warmup_microbatches:
output_tensor_grads[num_model_chunks-1].append(
p2p_communication.recv_backward(timers))
p2p_communication.recv_backward(tensor_shape, timers=timers))
for k in range(num_microbatches_remaining, num_microbatches):
input_tensor_grad = backward_step_helper(k)
next_backward_model_chunk_id = get_model_chunk_id(k+1, forward=False)
......@@ -352,11 +413,107 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
recv_next = False
output_tensor_grads[next_backward_model_chunk_id].append(
p2p_communication.send_backward_recv_backward(
input_tensor_grad, recv_next, timers))
input_tensor_grad, recv_next=recv_next,
tensor_shape=tensor_shape,
timers=timers))
return losses_reduced
def get_tensor_shapes(rank, model_type):
# Determine the right tensor sizes (based on the position of the rank with
# respect to the split rank) and the model type.
# Send two tensors if model is T5 and rank is in decoder stage:
# first tensor is decoder (pre-transpose),
# second tensor is encoder (post-transpose).
# If model is T5 and rank is at the boundary:
# send one tensor (post-transpose from encoder).
# Otherwise, send one tensor (pre-transpose).
args = get_args()
tensor_shapes = []
if model_type == ModelType.encoder_and_decoder:
if mpu.is_pipeline_stage_before_split(rank):
# If next rank is after split, then need transpose for encoder_hidden_state.
if mpu.is_pipeline_stage_before_split(rank+1):
tensor_shapes.append((args.seq_length, args.micro_batch_size, args.hidden_size))
else:
tensor_shapes.append((args.micro_batch_size, args.seq_length, args.hidden_size))
else:
tensor_shapes.append((args.decoder_seq_length, args.micro_batch_size, args.hidden_size))
tensor_shapes.append((args.micro_batch_size, args.seq_length, args.hidden_size))
else:
tensor_shapes.append((args.seq_length, args.micro_batch_size, args.hidden_size))
return tensor_shapes
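# Illustrative example (assumed config values, not from this diff): with
# seq_length=512, decoder_seq_length=128, micro_batch_size=4 and
# hidden_size=1024, a rank in the decoder stack gets two shapes,
# (128, 4, 1024) for the decoder activation and (4, 512, 1024) for the
# encoder hidden state, while an encoder-side rank whose next rank is still
# before the split gets a single (512, 4, 1024) shape.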
def recv_forward(tensor_shapes, timers):
input_tensors = []
for tensor_shape in tensor_shapes:
if tensor_shape is None:
input_tensors.append(None)
else:
input_tensors.append(p2p_communication.recv_forward(tensor_shape,
timers=timers))
return input_tensors
def recv_backward(tensor_shapes, timers):
output_tensor_grads = []
for tensor_shape in tensor_shapes:
if tensor_shape is None:
output_tensor_grads.append(None)
else:
output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape,
timers=timers))
return output_tensor_grads
def send_forward(output_tensors, tensor_shapes, timers):
if not isinstance(output_tensors, list):
output_tensors = [output_tensors]
for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes):
if tensor_shape is None:
continue
p2p_communication.send_forward(output_tensor, tensor_shape, timers=timers)
def send_backward(input_tensor_grads, tensor_shapes, timers):
if not isinstance(input_tensor_grads, list):
input_tensor_grads = [input_tensor_grads]
for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes):
if tensor_shape is None:
continue
p2p_communication.send_backward(input_tensor_grad, tensor_shape, timers=timers)
def send_forward_recv_backward(output_tensors, tensor_shapes, timers):
if not isinstance(output_tensors, list):
output_tensors = [output_tensors]
output_tensor_grads = []
for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes):
if tensor_shape is None:
output_tensor_grads.append(None)
continue
output_tensor_grad = p2p_communication.send_forward_recv_backward(
output_tensor, tensor_shape, timers=timers)
output_tensor_grads.append(output_tensor_grad)
return output_tensor_grads
def send_backward_recv_forward(input_tensor_grads, tensor_shapes, timers):
if not isinstance(input_tensor_grads, list):
input_tensor_grads = [input_tensor_grads]
input_tensors = []
for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes):
if tensor_shape is None:
input_tensors.append(None)
continue
input_tensor = p2p_communication.send_backward_recv_forward(
input_tensor_grad, tensor_shape, timers=timers)
input_tensors.append(input_tensor)
return input_tensors
def forward_backward_pipelining_without_interleaving(forward_step_func, data_iterator,
model, optimizer, timers,
forward_only):
......@@ -380,25 +537,37 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
num_microbatches_remaining = \
num_microbatches - num_warmup_microbatches
input_tensors = []
output_tensors = []
unwrapped_model = unwrap_model(
model, (torchDDP, LocalDDP, Float16Module))
model_type = unwrapped_model.model_type
rank = mpu.get_pipeline_model_parallel_rank()
recv_tensor_shapes = get_tensor_shapes(rank-1, model_type)
send_tensor_shapes = get_tensor_shapes(rank, model_type)
# Input, output tensors only need to be saved when doing backward passes
input_tensors = None
output_tensors = None
if not forward_only:
input_tensors = []
output_tensors = []
losses_reduced = []
# Run warmup forward passes.
for i in range(num_warmup_microbatches):
input_tensor = p2p_communication.recv_forward(timers)
input_tensor = recv_forward(recv_tensor_shapes, timers=timers)
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
p2p_communication.send_forward(output_tensor, timers)
send_forward(output_tensor, send_tensor_shapes, timers=timers)
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
if not forward_only:
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
# Before running 1F1B, need to receive first forward tensor.
# If all microbatches are run in warmup / cooldown phase, then no need to
# receive this tensor here.
if num_microbatches_remaining > 0:
input_tensor = p2p_communication.recv_forward(timers)
input_tensor = recv_forward(recv_tensor_shapes, timers=timers)
# Run 1F1B in steady state.
for i in range(num_microbatches_remaining):
......@@ -407,22 +576,25 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
if forward_only:
p2p_communication.send_forward(output_tensor, timers)
send_forward(output_tensor, send_tensor_shapes, timers=timers)
if not last_iteration:
input_tensor = recv_forward(recv_tensor_shapes, timers=timers)
else:
output_tensor_grad = \
p2p_communication.send_forward_recv_backward(output_tensor,
timers)
send_forward_recv_backward(output_tensor,
send_tensor_shapes,
timers=timers)
# Add input_tensor and output_tensor to end of list, then pop from the
# start of the list for backward pass.
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
# Add input_tensor and output_tensor to end of list.
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
if forward_only:
if not last_iteration:
input_tensor = p2p_communication.recv_forward(timers)
else:
input_tensor, output_tensor = input_tensors.pop(0), output_tensors.pop(0)
# Pop input_tensor and output_tensor from the start of the list for
# the backward pass.
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
......@@ -430,11 +602,11 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
if last_iteration:
input_tensor = None
p2p_communication.send_backward(input_tensor_grad, timers)
send_backward(input_tensor_grad, recv_tensor_shapes, timers=timers)
else:
input_tensor = \
p2p_communication.send_backward_recv_forward(
input_tensor_grad, timers)
send_backward_recv_forward(
input_tensor_grad, recv_tensor_shapes, timers=timers)
# Run cooldown backward passes.
if not forward_only:
......@@ -442,12 +614,12 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
output_tensor_grad = p2p_communication.recv_backward(timers)
output_tensor_grad = recv_backward(send_tensor_shapes, timers=timers)
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad)
p2p_communication.send_backward(input_tensor_grad, timers)
send_backward(input_tensor_grad, recv_tensor_shapes, timers=timers)
return losses_reduced
......@@ -13,18 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
MAJOR = 1
MINOR = 1.5
# Use the following formatting: (major, minor)
VERSION = (MAJOR, MINOR)
__version__ = '.'.join(map(str, VERSION))
__package_name__ = 'megatron-lm'
__contact_names__ = 'NVIDIA INC'
__url__ = 'https://github.com/NVIDIA/Megatron-LM'
__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
__description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.'
__license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE'
__keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language'
from .api import (
generate,
generate_and_post_process)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference API."""
import torch
from megatron import mpu
from .communication import broadcast_float_list
from .generation import (
generate_tokens_probs_and_return_on_first_stage,
score_and_return_on_first_stage)
from .tokenization import (
tokenize_prompts,
detokenize_generations)
def generate_and_post_process(model,
prompts=None,
tokens_to_generate=0,
return_output_log_probs=False,
top_k_sampling=0,
top_p_sampling=0.0,
temperature=1.0,
add_BOS=False,
use_eod_token_for_early_termination=True):
"""Run inference and post-process outputs, i.e., detokenize,
move to cpu and convert to list."""
# Main inference.
tokens, lengths, output_log_probs = generate(
model,
prompts=prompts,
tokens_to_generate=tokens_to_generate,
return_output_log_probs=return_output_log_probs,
top_k_sampling=top_k_sampling,
top_p_sampling=top_p_sampling,
temperature=temperature,
add_BOS=add_BOS,
use_eod_token_for_early_termination=use_eod_token_for_early_termination)
# Only post-process on first stage.
if mpu.is_pipeline_first_stage():
tokens, prompts_plus_generations, prompts_plus_generations_segments = \
detokenize_generations(tokens, lengths, True)
if return_output_log_probs:
output_log_probs = output_log_probs.cpu().numpy().tolist()
for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)):
output_log_probs[i] = prob[:len(seg)-1]
return prompts_plus_generations, prompts_plus_generations_segments, \
output_log_probs, tokens
return None
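# Usage sketch (illustrative, not part of this diff): run the top-level
# inference API. Only the first pipeline stage returns results; other ranks
# get None. `model` is an assumed, already-built Megatron GPT model, and the
# distributed/megatron state is assumed to be initialized.
result = generate_and_post_process(model,
                                   prompts=["Megatron-LM is"],
                                   tokens_to_generate=32,
                                   return_output_log_probs=True,
                                   top_p_sampling=0.9)
if result is not None:  # first pipeline stage only
    prompts_plus_generations, segments, output_log_probs, tokens = result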
def generate(model,
prompts=None,
tokens_to_generate=0,
return_output_log_probs=False,
top_k_sampling=0,
top_p_sampling=0.0,
temperature=1.0,
add_BOS=False,
use_eod_token_for_early_termination=True):
"""Given prompts and input parameters, run inference and return:
tokens: prompts plus the generated tokens.
lengths: length of the prompt + generations. Note that we can
discard tokens in the tokens tensor that are after the
corresponding length.
output_log_probs: log probs of the tokens.
"""
# Make sure input params are available to all ranks.
values = [tokens_to_generate,
return_output_log_probs,
top_k_sampling, top_p_sampling,
temperature, add_BOS, use_eod_token_for_early_termination]
values_float_tensor = broadcast_float_list(7, float_list=values)
tokens_to_generate = int(values_float_tensor[0].item())
return_output_log_probs = bool(values_float_tensor[1].item())
top_k_sampling = int(values_float_tensor[2].item())
top_p_sampling = values_float_tensor[3].item()
temperature = values_float_tensor[4].item()
add_BOS = bool(values_float_tensor[5].item())
use_eod_token_for_early_termination = bool(values_float_tensor[6].item())
# Tokenize prompts and get the batch.
# Note that these tensors are broadcast to all ranks.
if torch.distributed.get_rank() == 0:
assert prompts is not None
context_tokens_tensor, context_length_tensor = tokenize_prompts(
prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)
if tokens_to_generate == 0:
return score_and_return_on_first_stage(
model, context_tokens_tensor, context_length_tensor)
# Main inference function.
# Note that the outputs are available on the first stage.
return generate_tokens_probs_and_return_on_first_stage(
model, context_tokens_tensor, context_length_tensor,
return_output_log_probs=return_output_log_probs,
top_k=top_k_sampling,
top_p=top_p_sampling,
temperature=temperature,
use_eod_token_for_early_termination=use_eod_token_for_early_termination)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Communications utilities."""
import torch
from megatron import mpu
# TODO: use functions from megatron/p2p
def recv_from_prev_pipeline_rank_(recv_buffer=None):
"""Receive from previous pipeline stage and update the
input buffer in place."""
if not mpu.is_pipeline_first_stage():
assert recv_buffer is not None
recv_prev_op = torch.distributed.P2POp(
torch.distributed.irecv, recv_buffer,
mpu.get_pipeline_model_parallel_prev_rank())
reqs = torch.distributed.batch_isend_irecv([recv_prev_op])
for req in reqs:
req.wait()
# To protect against race condition when using batch_isend_irecv().
torch.cuda.synchronize()
# TODO: use functions from megatron/p2p
def send_to_next_pipeline_rank(tensor=None):
"""Send output to the next pipeline stage."""
if not mpu.is_pipeline_last_stage():
assert tensor is not None
send_next_op = torch.distributed.P2POp(
torch.distributed.isend, tensor,
mpu.get_pipeline_model_parallel_next_rank())
reqs = torch.distributed.batch_isend_irecv([send_next_op])
for req in reqs:
req.wait()
# To protect against race condition when using batch_isend_irecv().
torch.cuda.synchronize()
def _is_cuda(tensor):
"""Check if a tensor is not none and is cuda."""
assert tensor is not None
assert tensor.is_cuda
def _is_cuda_contiguous(tensor):
"""Check if a tensor is not none, is cuda, and is contiguous."""
_is_cuda(tensor)
assert tensor.is_contiguous()
def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
"""Broadcast a tensor from last pipeline stage to all ranks."""
is_last_stage = mpu.is_pipeline_last_stage()
# If the first stage and last stage are the same, then there is no
# pipeline parallelism and no need to communicate.
if mpu.is_pipeline_first_stage() and is_last_stage:
return tensor
if is_last_stage:
_is_cuda_contiguous(tensor)
else:
tensor = torch.empty(size,
dtype=dtype,
device=torch.cuda.current_device())
# Get the group and corresponding source rank.
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_pipeline_model_parallel_group()
torch.distributed.broadcast(tensor, src, group)
return tensor
def broadcast_from_last_to_first_pipeline_stage(size, dtype, tensor=None):
"""Broadcast tensor values from last stage into the first stage."""
is_last_stage = mpu.is_pipeline_last_stage()
is_first_stage = mpu.is_pipeline_first_stage()
# If the first stage and last stage are the same, then there is no
# pipeline parallelism and no need to communicate.
if is_first_stage and is_last_stage:
return tensor
# Only first and last stage pipeline stages need to be involved.
if is_last_stage or is_first_stage:
if is_last_stage:
_is_cuda_contiguous(tensor)
else:
tensor = torch.empty(size,
dtype=dtype,
device=torch.cuda.current_device())
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
# Broadcast from last stage into the first stage.
torch.distributed.broadcast(tensor, src, group)
else:
tensor = None
return tensor
def copy_from_last_to_first_pipeline_stage(size, dtype, tensor=None):
"""Copy tensor values from last stage into the first stage.
Note that the input tensor is updated in place."""
is_last_stage = mpu.is_pipeline_last_stage()
is_first_stage = mpu.is_pipeline_first_stage()
# If the first stage and last stage are the same, then there is no
# pipeline parallelism and no need to communicate.
if is_first_stage and is_last_stage:
return
# Only first and last stage pipeline stages need to be involved.
if is_last_stage or is_first_stage:
_is_cuda(tensor)
is_contiguous = tensor.is_contiguous()
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
if is_contiguous:
tensor_ = tensor
else:
if is_last_stage:
tensor_ = tensor.contiguous()
else:
tensor_ = torch.empty(size,
dtype=dtype,
device=torch.cuda.current_device())
# Broadcast from last stage into the first stage.
torch.distributed.broadcast(tensor_, src, group)
# Update the first stage tensor
if is_first_stage and not is_contiguous:
tensor[...] = tensor_
def broadcast_tensor(size, dtype, tensor=None, rank=0):
""" Given size and type of a tensor on all ranks and the tensor value
only on a specific rank, broadcast from that rank to all other ranks.
"""
if torch.distributed.get_rank() == rank:
_is_cuda_contiguous(tensor)
else:
tensor = torch.empty(size,
dtype=dtype,
device=torch.cuda.current_device())
torch.distributed.broadcast(tensor, rank)
return tensor
def broadcast_list(size, dtype, list_values=None, rank=0):
"""Broadcast a list of values with a given type."""
tensor = None
if torch.distributed.get_rank() == rank:
tensor = torch.tensor(list_values, dtype=dtype,
device=torch.cuda.current_device())
return broadcast_tensor(size, dtype, tensor=tensor, rank=rank)
def broadcast_int_list(size, int_list=None, rank=0):
"""Broadcast a list of interger values."""
return broadcast_list(size, torch.int64, list_values=int_list, rank=rank)
def broadcast_float_list(size, float_list=None, rank=0):
"""Broadcast a list of float values."""
return broadcast_list(size, torch.float32, list_values=float_list,
rank=rank)
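# Usage sketch (illustrative): pack scalar sampling parameters into a float
# list and broadcast the rank-0 values to every rank, mirroring how generate()
# in api.py uses broadcast_float_list. Assumes torch.distributed is already
# initialized (e.g. via torchrun) and the CUDA device is set; the values are
# made up.
values = [64, 0.9, 1.0]  # tokens_to_generate, top_p, temperature
values_tensor = broadcast_float_list(3, float_list=values)
tokens_to_generate = int(values_tensor[0].item())
top_p = values_tensor[1].item()
temperature = values_tensor[2].item()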
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Forward step utilities."""
from collections.abc import Iterable
import torch
from megatron import (
get_args,
mpu)
from .communication import (
send_to_next_pipeline_rank,
recv_from_prev_pipeline_rank_)
class InferenceParams:
"""Inference parameters that are passed to the main model in order
to efficiently calculate and store the context during inference."""
def __init__(self, max_batch_size, max_sequence_len):
"""Note that offsets are set to zero and we always set the
flag to allocate memory. After the first call, make sure to
set this flag to False."""
self.max_sequence_len = max_sequence_len
self.max_batch_size = max_batch_size
self.sequence_len_offset = 0
self.batch_size_offset = 0
self.key_value_memory_dict = {}
class ForwardStep:
"""Forward step function with all the communications.
We use a class here to hide the inference parameters
from the outside caller."""
def __init__(self, model, max_batch_size, max_sequence_len):
"""Set values so we don't need to do it multiple times."""
# Make sure model is in eval mode.
assert not isinstance(model, Iterable), \
'interleaving schedule is not supported for inference'
model.eval()
self.model = model
# Initialize inference parameters.
self.inference_params = InferenceParams(max_batch_size,
max_sequence_len)
# Pipelining arguments.
args = get_args()
self.pipeline_size_larger_than_one = (
args.pipeline_model_parallel_size > 1)
# Threshold of pipelining.
self.pipelining_batch_x_seqlen = \
args.inference_batch_times_seqlen_threshold
def __call__(self, tokens, position_ids, attention_mask):
"""Invocation of the forward methods. Note that self.inference_params
is being modified by the forward step."""
# Pipelining case.
if self.pipeline_size_larger_than_one:
current_batch_x_seqlen = tokens.size(0) * tokens.size(1)
if current_batch_x_seqlen >= self.pipelining_batch_x_seqlen:
micro_batch_size = \
max(1, self.pipelining_batch_x_seqlen // tokens.size(1))
return _with_pipelining_forward_step(self.model,
tokens,
position_ids,
attention_mask,
self.inference_params,
micro_batch_size)
return _no_pipelining_forward_step(self.model,
tokens,
position_ids,
attention_mask,
self.inference_params)
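# Usage sketch (illustrative, not part of this diff): wrap a model once and
# call it per generation step; self.inference_params carries the cached
# key/value state between calls. `model`, `tokens`, `position_ids` and
# `attention_mask` are assumed to come from the surrounding generation loop.
forward_step = ForwardStep(model, max_batch_size=8, max_sequence_len=1024)
logits = forward_step(tokens, position_ids, attention_mask)
# `logits` is only meaningful on the last pipeline stage; other stages get None.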
def _get_recv_buffer_dtype(args):
"""Receive happens between the layers."""
if args.fp32_residual_connection:
return torch.float
return args.params_dtype
def _allocate_recv_buffer(batch_size, sequence_length):
"""Receive happens between the layers with size [s, b, h]."""
if mpu.is_pipeline_first_stage():
return None
args = get_args()
recv_size = (sequence_length, batch_size, args.hidden_size)
return torch.empty(recv_size,
dtype=_get_recv_buffer_dtype(args),
device=torch.cuda.current_device())
def _forward_step_helper(model, tokens, position_ids, attention_mask,
inference_params, recv_buffer=None):
"""Single forward step. Update the allocate memory flag so
only the first time the memory is allocated."""
batch_size = tokens.size(0)
sequence_length = tokens.size(1)
if recv_buffer is None:
recv_buffer = _allocate_recv_buffer(batch_size, sequence_length)
# Receive from previous stage.
recv_from_prev_pipeline_rank_(recv_buffer)
# Forward pass through the model.
model.set_input_tensor(recv_buffer)
output_tensor = model(tokens, position_ids, attention_mask,
inference_params=inference_params)
# Send output to the next stage.
send_to_next_pipeline_rank(output_tensor)
return output_tensor
def _no_pipelining_forward_step(model, tokens, position_ids, attention_mask,
inference_params, recv_buffer=None):
"""If recv_buffer is none, we will allocate one on the fly."""
# Run a simple forward pass.
output_tensor = _forward_step_helper(model, tokens, position_ids,
attention_mask, inference_params,
recv_buffer=recv_buffer)
# Update the sequence length offset.
inference_params.sequence_len_offset += tokens.size(1)
logits = None
if mpu.is_pipeline_last_stage():
logits = output_tensor
return logits
def _with_pipelining_forward_step(model, tokens, position_ids, attention_mask,
inference_params, micro_batch_size):
"""No interleaving is supported."""
sequence_length = tokens.size(1)
batch_size = tokens.size(0)
# Divide the batch dimension into micro batches.
num_micro_batches, last_chunk = divmod(batch_size,
micro_batch_size)
if last_chunk > 0:
num_micro_batches += 1
# Preallocate memory for output logits.
logits = None
if mpu.is_pipeline_last_stage():
args = get_args()
logits = torch.empty(
(batch_size, sequence_length, args.padded_vocab_size),
dtype=torch.float32, device=torch.cuda.current_device())
# Preallocate recv buffer.
recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length)
for micro_batch_index in range(num_micro_batches):
# Slice along the batch dimension.
start = micro_batch_index * micro_batch_size
end = min(start + micro_batch_size, batch_size)
this_micro_batch_size = end - start
tokens2use = tokens[start:end, ...]
position_ids2use = position_ids[start:end, ...]
# Run a simple forward pass.
if this_micro_batch_size != micro_batch_size:
recv_buffer = None
output = _forward_step_helper(model, tokens2use, position_ids2use,
attention_mask, inference_params,
recv_buffer=recv_buffer)
# Adjust the batch size offset to account for the micro-batch.
inference_params.batch_size_offset += this_micro_batch_size
# Copy logits.
if mpu.is_pipeline_last_stage():
logits[start:end, ...] = output
# Once we are done with all the micro-batches, we can
# adjust the sequence length offset.
inference_params.sequence_len_offset += sequence_length
# and reset the batch size offset
inference_params.batch_size_offset = 0
return logits
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generation utilities."""
import torch
import torch.nn.functional as F
from megatron import get_args, get_tokenizer, mpu
from megatron.utils import get_ltor_masks_and_position_ids
from .communication import (
copy_from_last_to_first_pipeline_stage,
broadcast_from_last_pipeline_stage,
broadcast_from_last_to_first_pipeline_stage)
from .forward_step import ForwardStep
from .sampling import sample
def score_and_return_on_first_stage(model, tokens, lengths):
"""Function for just scoring.
Arguments:
model: no interleaving is supported.
tokens: prompt tokens extended to be of size [b, max_prompt_length]
lengths: original prompt length, size: [b]
Note: apart from the model, the other parameters only need to be available on
rank 0.
Outputs:
output_log_probs: log probability of the selected tokens. size: [b, s]
"""
args = get_args()
batch_size = tokens.size(0)
max_prompt_length = lengths.max().item()
assert max_prompt_length == tokens.size(1)
max_sequence_length = min(max_prompt_length, args.max_position_embeddings)
# forward step.
forward_step = ForwardStep(model, batch_size, max_sequence_length)
# ===================
# Pre-allocate memory
# ===================
# Log probability of the sequence (prompt + generated tokens).
output_log_probs = None
output_log_probs_size = (batch_size, max_sequence_length - 1)
if mpu.is_pipeline_last_stage():
output_log_probs = torch.empty(output_log_probs_size,
dtype=torch.float32,
device=torch.cuda.current_device())
# =============
# Run inference
# =============
with torch.no_grad():
attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens)
# logits will be meaningful only in the last pipeline stage.
logits = forward_step(tokens, position_ids, attention_mask)
if mpu.is_pipeline_last_stage():
# The last stage should always have an output.
assert logits is not None
log_probs = F.log_softmax(logits, dim=2)
# Pick the tokens that we need to get the log
# probabilities for. Note that next input token is
# the token which we selected in the current logits,
# so shift by 1.
indices = torch.unsqueeze(tokens[:, 1:], 2)
output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2)
# ======================================
# Broadcast to the first pipeline stage.
# ======================================
output_log_probs = broadcast_from_last_to_first_pipeline_stage(
output_log_probs_size, torch.float32, output_log_probs)
return tokens, lengths, output_log_probs
def generate_tokens_probs_and_return_on_first_stage(
model, tokens, lengths,
return_output_log_probs=False,
top_k=0, top_p=0.0,
temperature=1.0,
use_eod_token_for_early_termination=True):
"""Main token generation function.
Arguments:
model: no interleaving is supported.
tokens: prompt tokens extended to be of size [b, max-sequence-length]
lengths: original prompt length, size: [b]
return_output_log_probs: flag to calculate the log probability of
the generated tokens. Note that the log probability is the one
from the original logit.
top_k, top_p: top-k and top-p sampling parameters.
Note that top-k = 1 is greedy. Also, these parameters are
mutually exclusive, meaning that:
if top-k > 0 then we expect top-p=0.
if top-p > 0 then we check for top-k=0.
temperature: sampling temperature.
use_eod_token_for_early_termination: if True, do early termination if
all the sequences have reached this token.
Note: apart from the model, the other parameters only need to be available on
rank 0.
Outputs: Note that the size is adjusted to a lower value than
max-sequence-length if generation is terminated early.
tokens: prompt and generated tokens. size: [b, :]
generated_sequence_lengths: total length (including prompt) of
the generated sequence. size: [b]
output_log_probs: log probability of the selected tokens. size: [b, s]
"""
args = get_args()
tokenizer = get_tokenizer()
batch_size = tokens.size(0)
min_prompt_length = lengths.min().item()
max_sequence_length = tokens.size(1)
max_sequence_length = min(max_sequence_length, args.max_position_embeddings)
# forward step.
forward_step = ForwardStep(model, batch_size, max_sequence_length)
# Added termination_id to support the case that we want to terminate the
# generation once that id is generated.
if hasattr(args, 'eos_id'):
termination_id = args.eos_id
else:
termination_id = tokenizer.eod
# ===================
# Pre-allocate memory
# ===================
# Log probability of the sequence (prompt + generated tokens).
output_log_probs = None
output_log_probs_size = (batch_size, max_sequence_length - 1)
# Lengths of the generated sequences, including the prompts.
generated_sequence_lengths = None
if mpu.is_pipeline_last_stage():
if return_output_log_probs:
output_log_probs = torch.empty(output_log_probs_size,
dtype=torch.float32,
device=torch.cuda.current_device())
generated_sequence_lengths = torch.ones(
batch_size, dtype=torch.int64,
device=torch.cuda.current_device()) * max_sequence_length
# Whether we have reached a termination id.
is_generation_done = torch.zeros(batch_size, dtype=torch.uint8,
device=torch.cuda.current_device())
# =============
# Run inference
# =============
with torch.no_grad():
attention_mask, position_ids = _build_attention_mask_and_position_ids(
tokens)
prev_context_length = 0
for context_length in range(min_prompt_length, max_sequence_length):
# Pick the slice that we need to pass through the network.
tokens2use = tokens[:, prev_context_length:context_length]
positions2use = position_ids[:, prev_context_length:context_length]
attention_mask2use = attention_mask[
..., prev_context_length:context_length, :context_length]
# logits will be meaningful only in the last pipeline stage.
logits = forward_step(tokens2use, positions2use, attention_mask2use)
if mpu.is_pipeline_last_stage():
# The last stage should always have an output.
assert logits is not None
# Sample.
last_token_logits = logits[:, -1, :]
new_sample = sample(last_token_logits,
top_k=top_k,
top_p=top_p,
temperature=temperature,
vocab_size=tokenizer.vocab_size)
# If a prompt length is smaller than or equal to the current context
# length, it means we have started generating tokens.
started = lengths <= context_length
# Update the tokens.
tokens[started, context_length] = new_sample[started]
# Calculate the log probabilities.
if return_output_log_probs:
log_probs = F.log_softmax(logits, dim=2)
if return_output_log_probs:
# Pick the tokens that we need to get the log
# probabilities for. Note that next input token is
# the token which we selected in the current logits,
# so shift by 1.
indices = torch.unsqueeze(
tokens[
:,
(prev_context_length + 1):(context_length + 1)],
2)
output_log_probs[:,
prev_context_length:context_length] = \
torch.gather(log_probs, 2, indices).squeeze(2)
# Update the tokens on the first stage so the next input to
# the network is correct.
copy_from_last_to_first_pipeline_stage(batch_size, torch.int64,
tokens[:, context_length])
# Update the context length for the next token generation.
prev_context_length = context_length
# Check if all the sequences have hit the termination_id.
done = None
if mpu.is_pipeline_last_stage():
done_token = (new_sample == termination_id).byte() & \
started.byte()
just_finished = (done_token & ~is_generation_done).bool()
generated_sequence_lengths[just_finished.view(-1)] = \
context_length + 1
is_generation_done = is_generation_done | done_token
done = torch.all(is_generation_done)
done = broadcast_from_last_pipeline_stage(1, torch.uint8,
tensor=done)
if use_eod_token_for_early_termination and done:
break
# ===================================================
# Update the lengths based on the max generated length.
# ===================================================
tokens = tokens[:, :(context_length + 1)]
if mpu.is_pipeline_last_stage():
if return_output_log_probs:
output_log_probs = output_log_probs[:, :context_length]
# ======================================
# Broadcast to the first pipeline stage.
# ======================================
generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage(
batch_size, torch.int64, generated_sequence_lengths)
if return_output_log_probs:
output_log_probs_size = (batch_size, context_length)
output_log_probs = broadcast_from_last_to_first_pipeline_stage(
output_log_probs_size, torch.float32, output_log_probs)
return tokens, generated_sequence_lengths, output_log_probs
def _build_attention_mask_and_position_ids(tokens):
"""Build the attention mask and postition ids for the input tokens."""
# Since we are not interested in loss-mask and reset attention/position
# is also False, eod_token is not used so it is safe to set it to None.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
data=tokens,
eod_token=None,
reset_position_ids=False,
reset_attention_mask=False,
eod_mask_loss=False)
return attention_mask, position_ids
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sampling utilities.
Part of this code is inspired by:
- https://github.com/ari-holtzman/degen/blob/master/gen.py
- https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html
"""
import torch
def modify_logits_for_top_k_filtering(logits, top_k):
"""Set the logits for none top-k values to -inf."""
filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits.masked_fill_(filter_, float('-Inf'))
def modify_logits_for_top_p_filtering(logits, top_p):
"""Set the logits for none top-p values to -inf."""
# First sort and calculate cumulative sum of probabilities.
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
# Filtering based on the cumulative sum.
filter_ = cumulative_probs > top_p
# This shift by 1 is weird and I cannot justify it. This existed
# in the original implementation:
# https://github.com/ari-holtzman/degen/blob/master/gen.py
# and I guess it is needed so keeping it for now.
filter_[:, 1:] = filter_[:, :-1].clone()
# Make sure we at least have one token to select from.
filter_[..., 0] = 0
# Fill in the filtered part
filter_ = filter_.scatter(1, sorted_indices, filter_)
logits.masked_fill_(filter_, float('-Inf'))
def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None):
""" Sample and generate a token.
Note: logits has the dimension [b, v] where b is the batch size
and v is the vocabulary size.
    If vocab_size is provided, we will make sure the samples that are
    generated are in [0, vocab_size). This avoids out-of-vocabulary
    generations due to vocabulary padding.
"""
# Check logits for consistency.
assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.'
assert logits.type() == 'torch.cuda.FloatTensor', \
'input logits should be floats.'
# Greedy is just simple argmax.
if top_k == 1:
assert top_p == 0.0, 'cannot set both greedy and top-p samplings.'
samples = torch.argmax(logits, dim=-1)
# Top-k or top-p sampling.
else:
        # Clone so we do not modify the inputs.
logits = logits.clone()
# Apply temperature in place.
if temperature != 1.0:
logits.div_(temperature)
if top_k > 1:
assert top_p == 0.0, 'cannot set both top-k and top-p samplings.'
assert top_k <= logits.size(1), 'top-k is larger than logit size.'
if vocab_size:
assert top_k < vocab_size, 'top-k is larger than vocab size.'
modify_logits_for_top_k_filtering(logits, top_k)
elif top_p > 0.0:
assert top_p <= 1.0, 'top-p should be in (0, 1].'
modify_logits_for_top_p_filtering(logits, top_p)
# After filtering, we need to recalculate the distribution.
probs = logits.softmax(dim=-1)
samples = torch.multinomial(probs, num_samples=1).view(-1)
    # If vocab size is provided, make sure the samples are in
    # the range [0, vocab_size).
if vocab_size:
samples = torch.clamp(samples, min=0, max=(vocab_size - 1))
return samples
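# Minimal usage sketch of `sample`. It assumes a CUDA device is available
# (the assert above requires cuda float logits); the batch size of 4 and
# the GPT-2 style vocab sizes below are illustrative values only.
def _example_sample():
    logits = torch.randn(4, 50304, device='cuda')   # [b, v], padded vocab
    # Top-k sampling with temperature; clamping to the unpadded vocab size
    # guarantees padded logit positions are never returned.
    return sample(logits, top_k=50, top_p=0.0, temperature=0.8,
                  vocab_size=50257)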
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization utilities."""
import torch
from megatron import get_tokenizer
from .communication import broadcast_int_list, broadcast_tensor
def detokenize_generations(tokens_gpu_tensor,
lengths_gpu_tensor,
return_segments):
"""Detokenize the generated tokens."""
tokenizer = get_tokenizer()
prompts_plus_generations = []
if return_segments:
prompts_plus_generations_segments = []
tokens = tokens_gpu_tensor.cpu().numpy().tolist()
lengths = lengths_gpu_tensor.cpu().numpy().tolist()
for sequence_tokens, length in zip(tokens, lengths):
sequence_tokens = sequence_tokens[:length]
prompts_plus_generations.append(
tokenizer.detokenize(sequence_tokens))
if return_segments:
words = []
for token in sequence_tokens:
word = tokenizer.tokenizer.decoder[token]
word = bytearray(
[tokenizer.tokenizer.byte_decoder[c] for c in word]).decode(
'utf-8', errors='replace')
words.append(word)
prompts_plus_generations_segments.append(words)
if return_segments:
return tokens, prompts_plus_generations, \
prompts_plus_generations_segments
return tokens, prompts_plus_generations
def tokenize_prompts(prompts=None, tokens_to_generate=None,
add_BOS=None, rank=0):
"""Tokenize prompts and make them avaiable on all ranks."""
# On all ranks set to None so we can pass them to functions
sizes_list = None
prompts_tokens_cuda_long_tensor = None
prompts_length_cuda_long_tensor = None
# On the specified rank, build the above.
if torch.distributed.get_rank() == rank:
assert prompts is not None
assert tokens_to_generate is not None
# Tensor of tokens padded and their unpadded length.
prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
_tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS)
        # We need the sizes of these tensors for the broadcast.
        sizes_list = [prompts_tokens_cuda_long_tensor.size(0),   # Batch size
                      prompts_tokens_cuda_long_tensor.size(1)]   # Sequence length
# First, broadcast the sizes.
sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank)
    # Now that we have the sizes, we can broadcast the tokens
# and length tensors.
sizes = sizes_tensor.tolist()
prompts_tokens_cuda_long_tensor = broadcast_tensor(
sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank)
prompts_length_cuda_long_tensor = broadcast_tensor(
sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor,
rank=rank)
return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor
def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS):
"""Given a set of prompts and number of tokens to generate:
- tokenize prompts
    - set the sequence length to be the maximum prompt length
      plus the number of tokens we would like to generate
- pad all the sequences to this length so we can convert them
into a 2D tensor.
"""
# Tokenize all the prompts.
tokenizer = get_tokenizer()
if add_BOS:
prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt)
for prompt in prompts]
else:
prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]
    # Now we have a list of lists of tokens, where each list has a different
    # length. We want to extend these lists to:
# - incorporate the tokens that need to be generated
# - make all the sequences equal length.
# Get the prompts length.
prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens]
# Get the max prompts length.
max_prompt_len = max(prompts_length)
    # Number of tokens in each sample of the batch.
samples_length = max_prompt_len + tokens_to_generate
# Now update the list of list to be of the same size: samples_length.
for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length):
padding_size = samples_length - prompt_length
prompt_tokens.extend([tokenizer.eod] * padding_size)
    # Now that we are in a structured format, we can convert to tensors.
prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens)
prompts_length_tensor = torch.cuda.LongTensor(prompts_length)
return prompts_tokens_tensor, prompts_length_tensor
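# Toy sketch of the padding logic above with made-up token ids, assuming a
# pad/eod id of 0 (the real code pads with tokenizer.eod) and
# tokens_to_generate=2: every row is padded to max_prompt_len + 2.
def _example_pad_prompts():
    prompts_tokens = [[11, 12, 13], [21, 22]]
    prompts_length = [len(tokens) for tokens in prompts_tokens]
    samples_length = max(prompts_length) + 2    # tokens_to_generate = 2
    for tokens, length in zip(prompts_tokens, prompts_length):
        tokens.extend([0] * (samples_length - length))
    # prompts_tokens is now [[11, 12, 13, 0, 0], [21, 22, 0, 0, 0]].
    return torch.tensor(prompts_tokens), torch.tensor(prompts_length)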
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import torch
import json
import threading
from flask import Flask, request, jsonify, current_app
from flask_restful import Resource, Api
from megatron import get_args
from megatron.text_generation import generate_and_post_process
GENERATE_NUM = 0
lock = threading.Lock()
class MegatronGenerate(Resource):
def __init__(self, model):
self.model = model
@staticmethod
def send_do_generate():
choice = torch.cuda.LongTensor([GENERATE_NUM])
torch.distributed.broadcast(choice, 0)
def put(self):
args = get_args()
print("request IP: " + str(request.remote_addr))
        print(json.dumps(request.get_json()), flush=True)
print("current time: ", datetime.datetime.now())
if not "prompts" in request.get_json():
return "prompts argument required", 400
if "max_len" in request.get_json():
return "max_len is no longer used. Replace with tokens_to_generate", 400
if "sentences" in request.get_json():
return "sentences is no longer used. Replace with prompts", 400
prompts = request.get_json()["prompts"]
if len(prompts) > 128:
return "Maximum number of prompts is 128", 400
        tokens_to_generate = 64  # Choosing a hopefully sane default; generating the full sequence is slow.
if "tokens_to_generate" in request.get_json():
tokens_to_generate = request.get_json()["tokens_to_generate"]
if not isinstance(tokens_to_generate, int):
return "tokens_to_generate must be an integer greater than 0"
if tokens_to_generate < 0:
return "tokens_to_generate must be an integer greater than or equal to 0"
logprobs = False
if "logprobs" in request.get_json():
logprobs = request.get_json()["logprobs"]
if not isinstance(logprobs, bool):
return "logprobs must be a boolean value"
if tokens_to_generate == 0 and not logprobs:
return "tokens_to_generate=0 implies logprobs should be True"
temperature = 1.0
if "temperature" in request.get_json():
temperature = request.get_json()["temperature"]
if not (type(temperature) == int or type(temperature) == float):
return "temperature must be a positive number less than or equal to 100.0"
if not (0.0 < temperature <= 100.0):
return "temperature must be a positive number less than or equal to 100.0"
        top_k = 0
if "top_k" in request.get_json():
top_k = request.get_json()["top_k"]
if not (type(top_k) == int):
return "top_k must be an integer equal to or greater than 0 and less than or equal to 1000"
if not (0 <= top_k <= 1000):
return "top_k must be equal to or greater than 0 and less than or equal to 1000"
top_p = 0.0
if "top_p" in request.get_json():
top_p = request.get_json()["top_p"]
if not (type(top_p) == float):
return "top_p must be a positive float less than or equal to 1.0"
if top_p > 0.0 and top_k > 0.0:
return "cannot set both top-k and top-p samplings."
if not (0 <= top_p <= 1.0):
return "top_p must be less than or equal to 1.0"
add_BOS = False
if "add_BOS" in request.get_json():
add_BOS = request.get_json()["add_BOS"]
if not isinstance(add_BOS, bool):
return "add_BOS must be a boolean value"
        with lock:  # Need to get the lock to keep multiple threads from hitting the generate code at the same time
MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate
response, response_seg, response_logprobs, _ = \
generate_and_post_process(
self.model,
prompts=prompts,
tokens_to_generate=tokens_to_generate,
return_output_log_probs=logprobs,
top_k_sampling=top_k,
top_p_sampling=top_p,
temperature=temperature,
add_BOS=add_BOS,
use_eod_token_for_early_termination=True)
return jsonify({"text": response,
"segments": response_seg,
"logprobs": response_logprobs})
class MegatronServer(object):
def __init__(self, model):
self.app = Flask(__name__, static_url_path='')
api = Api(self.app)
api.add_resource(MegatronGenerate, '/api', resource_class_args=[model])
def run(self, url):
self.app.run(url, threaded=True, debug=False)
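# Hedged client sketch for the /api endpoint above. The host and port are
# assumptions (use whatever address MegatronServer.run was started with);
# the payload fields mirror the checks in MegatronGenerate.put. `requests`
# is not a Megatron dependency and is used here only for illustration.
def _example_client_request(url="http://localhost:5000/api"):
    import requests
    payload = {"prompts": ["Megatron-LM is"],
               "tokens_to_generate": 32,
               "temperature": 0.9,
               "top_k": 50,
               "logprobs": False}
    response = requests.put(url, json=payload)
    # Expected keys on success: "text", "segments", "logprobs".
    return response.json()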
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for generating text."""
import copy
import json
import os
import time
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron import print_rank_0
from megatron import get_tokenizer
from megatron import mpu
from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
from megatron.p2p_communication import recv_forward, send_forward
# These are needed to unwrap the model; it would be nice to move them into megatron.utils if possible.
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import Float16Module
def get_batch(context_tokens):
"""Generate batch from context tokens."""
args = get_args()
tokenizer = get_tokenizer()
# Move to GPU.
tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().cuda()
    # Get the attention mask and position ids.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return tokens, attention_mask, position_ids
def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
""" This function has been mostly taken from huggingface conversational
ai code at
https://medium.com/huggingface/how-to-build-a-state-of-the-art-
conversational-ai-with-transfer-learning-2d818ac26313 """
if top_k > 0:
# Remove all tokens with a probability less than the
# last token of the top-k
indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits[indices_to_remove] = filter_value
if top_p > 0.0:
        # Sort the logits in descending order.
sorted_logits, sorted_indices = torch.sort(
logits, descending=True, dim=-1)
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1),
dim=-1)
# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove = cumulative_probs > top_p
# Shift the indices to the right to keep also the first token
# above the threshold
sorted_indices_to_remove[..., 1:] \
= sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
for i in range(sorted_indices.size(0)):
indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
logits[i][indices_to_remove] = filter_value
return logits
def generate_samples_input_from_file(model):
args = get_args()
tokenizer = get_tokenizer()
# Read the sample file and open the output file.
assert args.sample_input_file is not None, \
'sample input file is not provided.'
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
fname = open(args.sample_input_file, "r")
all_raw_text = fname.readlines()
input_count = len(all_raw_text)
input_pos = 0
if args.sample_output_file is None:
sample_output_file = args.sample_input_file + ".out"
print('`sample-output-file` not specified, setting '
'it to {}'.format(sample_output_file))
else:
sample_output_file = args.sample_output_file
fname_out = open(sample_output_file, "w+")
context_count = 0
model.eval()
with torch.no_grad():
while True:
terminate_runs = 0
raw_text_len = 0
if mpu.is_pipeline_first_stage() \
and mpu.get_tensor_model_parallel_rank() == 0:
raw_text = all_raw_text[input_pos]
input_pos += 1
if input_pos == input_count:
raw_text = "stop"
raw_text_len = len(raw_text)
if "stop" in raw_text:
terminate_runs = 1
else:
context_tokens = tokenizer.tokenize(raw_text)
context_length = len(context_tokens)
if context_length >= (args.seq_length // 2):
print("\nContext length", context_length,
"\nPlease give smaller context (half of the "
"sequence length)!", flush=True)
continue
else:
context_tokens = tokenizer.tokenize("EMPTY TEXT")
context_length = 0
input_info = [terminate_runs, raw_text_len, context_length]
input_info_tensor = torch.cuda.LongTensor(input_info)
torch.distributed.all_reduce(input_info_tensor,
group=mpu.get_model_parallel_group())
terminate_runs = input_info_tensor[0].item()
raw_text_len = input_info_tensor[1].item()
context_length = input_info_tensor[2].item()
if terminate_runs == 1:
return
# For pipeline parallel we send context tokens to other stages
# so they get the lengths correct
if mpu.get_tensor_model_parallel_rank() == 0 \
and args.pipeline_model_parallel_size > 1:
if mpu.is_pipeline_first_stage():
src = mpu.get_pipeline_model_parallel_first_rank()
group = mpu.get_pipeline_model_parallel_group()
context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
torch.distributed.broadcast(context_tokens_tensor, src, group)
else:
src = mpu.get_pipeline_model_parallel_first_rank()
group = mpu.get_pipeline_model_parallel_group()
context_tokens_tensor = torch.empty(context_length,
dtype=torch.int64,
device=torch.device("cuda"))
torch.distributed.broadcast(context_tokens_tensor, src, group)
context_tokens = context_tokens_tensor.cpu().numpy().tolist()
token_stream = get_token_stream(model, [context_tokens])
for _, decode_tokens in enumerate(token_stream):
pass
if mpu.get_tensor_model_parallel_rank() == 0:
if mpu.is_pipeline_first_stage():
os.system('clear')
print("\nContext:", raw_text, flush=True)
fname_out.write("\nContext:")
fname_out.write(raw_text)
decode_tokens, _ = decode_tokens
decode_tokens = decode_tokens[0].cpu().numpy().tolist()
trim_decode_tokens = tokenizer.detokenize(
decode_tokens)[raw_text_len:]
print("\nMegatron-LM:", trim_decode_tokens, flush=True)
fname_out.write("\n\nMegatron-LM:")
fname_out.write(trim_decode_tokens)
fname_out.write("\n")
raw_text = None
context_count += 1
# We added this function to support task evaluation (e.g., SQuAD and DROP)
# in the https://github.com/EleutherAI/lm-evaluation-harness codebase. The
# lm-evaluation-harness code can now call this function similarly to its
# existing generate call used for GPT-style models.
def generate_samples_eval(model, context, max_gen_length, eos_token_id):
# Generate samples for lm evaluation
# NEED TO THINK ABOUT eos token
args = get_args()
tokenizer = get_tokenizer()
raw_text_len = len(context)
model.eval()
context_tokens = tokenizer.tokenize(context)
args.out_seq_length = max_gen_length + len(context_tokens)
args.eos_id = eos_token_id
with torch.no_grad():
token_stream = get_token_stream(model, [context_tokens])
for counter, decode_tokens in enumerate(token_stream):
if counter == args.out_seq_length:
break
decode_tokens, _ = decode_tokens
decode_tokens = decode_tokens[0].cpu().numpy().tolist()
trim_decode_tokens = tokenizer.detokenize(
decode_tokens)[raw_text_len:]
return trim_decode_tokens
def generate_samples_interactive(model, print_frequency=24):
args = get_args()
tokenizer = get_tokenizer()
context_count = 0
model.eval()
with torch.no_grad():
while True:
terminate_runs = 0
raw_text_len = 0
if mpu.is_pipeline_first_stage() \
and mpu.get_tensor_model_parallel_rank() == 0:
os.system('clear')
raw_text = input("\nContext prompt (stop to exit) >>> ")
while not raw_text:
print('Prompt should not be empty!')
raw_text = input("\nContext prompt (stop to exit) >>> ")
raw_text_len = len(raw_text)
if "stop" in raw_text:
terminate_runs = 1
else:
context_tokens = tokenizer.tokenize(raw_text)
# context_tokens = context_tokens + [tokenizer.sep_id]
context_length = len(context_tokens)
if context_length >= (args.seq_length // 2):
print("\nContext length", context_length,
"\nPlease give smaller context (half of the "
"sequence length)!", flush=True)
continue
else:
context_tokens = tokenizer.tokenize("EMPTY TEXT")
context_length = 0
input_info = [terminate_runs, raw_text_len, context_length]
input_info_tensor = torch.cuda.LongTensor(input_info)
torch.distributed.all_reduce(input_info_tensor,
group=mpu.get_model_parallel_group())
terminate_runs = input_info_tensor[0].item()
raw_text_len = input_info_tensor[1].item()
context_length = input_info_tensor[2].item()
if terminate_runs == 1:
return
# For pipeline parallel we send context tokens to other stages
# so they get the lengths correct
if mpu.get_tensor_model_parallel_rank() == 0 \
and args.pipeline_model_parallel_size > 1:
if mpu.is_pipeline_first_stage():
src = mpu.get_pipeline_model_parallel_first_rank()
group = mpu.get_pipeline_model_parallel_group()
context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
torch.distributed.broadcast(context_tokens_tensor, src, group)
else:
src = mpu.get_pipeline_model_parallel_first_rank()
group = mpu.get_pipeline_model_parallel_group()
context_tokens_tensor = torch.empty(context_length,
dtype=torch.int64,
device=torch.device("cuda"))
torch.distributed.broadcast(context_tokens_tensor, src, group)
context_tokens = context_tokens_tensor.cpu().numpy().tolist()
token_stream = get_token_stream(model, [context_tokens])
for counter, decode_tokens in enumerate(token_stream):
if counter % print_frequency != 0 \
or mpu.get_tensor_model_parallel_rank() != 0 \
or not mpu.is_pipeline_first_stage():
continue
os.system('clear')
print("\nContext:", raw_text, flush=True)
decode_tokens, _ = decode_tokens
# print("tokenzied inputs:", tokenizer.tokenize(raw_text))
# print("decode_tokens:", decode_tokens)
decode_tokens = decode_tokens[0].cpu().numpy().tolist()
trim_decode_tokens = tokenizer.detokenize(
decode_tokens)[raw_text_len:]
# trim_decode_tokens = tokenizer.detokenize(
# decode_tokens[context_length:])
print("\nMegatron-LM:", trim_decode_tokens, flush=True)
if mpu.is_pipeline_first_stage() \
and mpu.get_tensor_model_parallel_rank() == 0:
os.system('clear')
print("\nContext:", raw_text, flush=True)
if not isinstance(decode_tokens, list):
decode_tokens, _ = decode_tokens
decode_tokens = decode_tokens[0].cpu().numpy().tolist()
trim_decode_tokens = tokenizer.detokenize(
decode_tokens)[raw_text_len:]
# print("decode_tokens:", decode_tokens)
# trim_decode_tokens = tokenizer.detokenize(
# decode_tokens[context_length:])
print("\nMegatron-LM:", trim_decode_tokens, flush=True)
input("\nPress Enter to continue >>>")
raw_text = None
context_count += 1
def generate_samples_unconditional(model):
args = get_args()
tokenizer = get_tokenizer()
num_samples = args.num_samples
context_tokens = [[tokenizer.eod]
for _ in range(args.micro_batch_size)]
ctr = 0
while True:
start_time = time.time()
for token_stream in get_token_stream(model,
copy.deepcopy(context_tokens)):
pass
if mpu.is_pipeline_last_stage() and \
mpu.get_tensor_model_parallel_rank() == 0:
if ctr % args.log_interval == 0:
print('Avg s/batch:',
(time.time() - start_time) / min(args.log_interval, ctr + 1))
start_time = time.time()
length = len(token_stream)
token_batch = token_stream[0].cpu().numpy().tolist()
length_batch = token_stream[1].cpu().numpy().tolist()
assert len(length_batch) == args.micro_batch_size
for tokens, length in zip(token_batch, length_batch):
tokens = tokens[1:length - 1]
text = tokenizer.detokenize(tokens)
is_finished = length < args.seq_length - 1
datum = {'text': text, 'length': length - 1, 'finished': is_finished}
yield datum
ctr += 1
if ctr >= num_samples:
break
else:
for _ in range(args.micro_batch_size):
yield None
ctr += 1
if ctr >= num_samples:
break
if ctr >= num_samples:
break
def generate_and_write_samples_unconditional(model):
args = get_args()
assert args.genfile is not None
with open(args.genfile, 'w') as f:
for datum in generate_samples_unconditional(model):
if mpu.is_pipeline_last_stage() and \
mpu.get_tensor_model_parallel_rank() == 0:
f.write(json.dumps(datum) + '\n')
def pad_batch(batch, pad_id, args):
context_lengths = []
for tokens in batch:
context_length = len(tokens)
if context_length < args.seq_length:
tokens.extend([pad_id] * (args.seq_length - context_length))
context_lengths.append(context_length)
return batch, context_lengths
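# Quick sketch of `pad_batch` on toy inputs. `SimpleNamespace` stands in
# for the real Megatron args object; seq_length=6 and pad_id=0 are
# arbitrary illustration values.
def _example_pad_batch():
    from types import SimpleNamespace
    toy_args = SimpleNamespace(seq_length=6)
    batch, context_lengths = pad_batch([[7, 8, 9], [4]], pad_id=0,
                                       args=toy_args)
    # batch == [[7, 8, 9, 0, 0, 0], [4, 0, 0, 0, 0, 0]]
    # context_lengths == [3, 1]
    return batch, context_lengths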
def get_token_stream(model, context_tokens):
args = get_args()
tokenizer = get_tokenizer()
context_tokens, context_lengths = pad_batch(context_tokens,
tokenizer.eod, args)
context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
context_length_tensor = torch.cuda.LongTensor(context_lengths)
torch.distributed.broadcast(context_length_tensor,
mpu.get_tensor_model_parallel_src_rank(),
group=mpu.get_tensor_model_parallel_group())
torch.distributed.broadcast(context_tokens_tensor,
mpu.get_tensor_model_parallel_src_rank(),
group=mpu.get_tensor_model_parallel_group())
context_length = context_length_tensor.min().item()
tokens, attention_mask, position_ids = get_batch(context_tokens_tensor)
batch_token_iterator = sample_sequence_batch(model, context_tokens_tensor,
context_length_tensor,
attention_mask, position_ids)
for tokens, lengths in batch_token_iterator:
context_length += 1
if tokens is not None:
yield tokens[:, :context_length], lengths
else:
yield None, None
def switch(val1, val2, boolean):
boolean = boolean.type_as(val1)
return (1 - boolean) * val1 + boolean * val2
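# Tiny sketch of `switch`: where `boolean` is 1, the value from val2 is
# taken; where it is 0, the value from val1 is kept. This is how newly
# sampled tokens overwrite only the positions where generation has started.
def _example_switch():
    val1 = torch.tensor([10, 20, 30])     # e.g. existing prompt tokens
    val2 = torch.tensor([11, 21, 31])     # e.g. freshly sampled tokens
    started = torch.tensor([0, 1, 1])     # 1 where generation has started
    return switch(val1, val2, started)    # tensor([10, 21, 31])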
def forward_step(model, tokens, position_ids, attention_mask, tokentype_ids,
layer_past=None, get_key_value=None,
forward_method_parallel_output=None):
    # The tensor shapes change when not using recompute, so we need to tell
    # the p2p_communication functions the correct sequence length.
args = get_args()
orig_seq_length = args.seq_length
args.seq_length = tokens.shape[1]
input_tensor = recv_forward()
# Forward pass through the model.
unwrapped_model = unwrap_model(
model, (torchDDP, LocalDDP, Float16Module))
unwrapped_model.set_input_tensor(input_tensor)
output_tensor = model(tokens, position_ids, attention_mask,
tokentype_ids=tokentype_ids,
layer_past=layer_past,
get_key_value=get_key_value,
forward_method_parallel_output=forward_method_parallel_output)
if get_key_value:
output_tensor, layer_past = output_tensor
send_forward(output_tensor)
args.seq_length = orig_seq_length
if get_key_value:
return output_tensor, layer_past
return output_tensor
def sample_sequence_batch(model, context_tokens, context_lengths,
attention_mask, position_ids,
maxlen=None, type_ids=None):
args = get_args()
tokenizer = get_tokenizer()
model.eval()
with torch.no_grad():
context_length = context_lengths.min().item()
        # added eos_id to support the function generate_samples_eval that
        # passes eos_id as an argument and needs termination when that id
        # is found.
if hasattr(args, 'eos_id'):
eos_id = args.eos_id
else:
eos_id = tokenizer.eod
counter = 0
org_context_length = context_length
layer_past = None
batch_size = context_tokens.size(0)
is_done = torch.zeros([batch_size]).byte().cuda()
tokens = context_tokens
if maxlen is None:
maxlen = args.seq_length - 1
if maxlen > (org_context_length + args.out_seq_length):
maxlen = org_context_length + args.out_seq_length
lengths = torch.ones([batch_size]).long().cuda() * maxlen
while context_length <= (maxlen):
if args.recompute:
output = forward_step(model, tokens,
position_ids,
attention_mask,
tokentype_ids=type_ids,
forward_method_parallel_output=False)
if mpu.is_pipeline_last_stage():
assert output is not None
logits = output[:, context_length - 1, :]
else:
types2use = None
if counter == 0:
tokens2use = tokens[:, :context_length]
positions2use = position_ids[:, :context_length]
if type_ids is not None:
types2use = type_ids[:, :context_length]
else:
tokens2use = tokens[:, context_length - 1].view(
batch_size, -1)
positions2use = position_ids[:, context_length - 1].view(
batch_size, -1)
if type_ids is not None:
types2use = type_ids[:, context_length - 1].view(
batch_size, -1)
output, layer_past = forward_step(model, tokens2use,
positions2use,
attention_mask,
layer_past=layer_past,
get_key_value=True,
tokentype_ids=types2use,
forward_method_parallel_output=False)
if mpu.is_pipeline_last_stage():
assert output is not None
logits = output[:, -1].view(batch_size, -1).contiguous()
if mpu.is_pipeline_last_stage():
if args.greedy:
prev = torch.argmax(logits, dim=-1).view(-1)
else:
logits = logits.float()
logits /= args.temperature
logits = top_k_logits(logits, top_k=args.top_k,
top_p=args.top_p)
log_probs = F.softmax(logits, dim=-1)
prev = torch.multinomial(log_probs, num_samples=1).view(-1)
started = context_lengths <= context_length
new_tokens = switch(
tokens[:, context_length].view(-1), prev, started)
tokens[:, context_length] = new_tokens
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
torch.distributed.broadcast(new_tokens, src, group)
done_token = (prev == eos_id).byte() & started.byte()
just_finished = (done_token & ~is_done).bool()
lengths[just_finished.view(-1)] = context_length
is_done = is_done | done_token
done = torch.all(is_done)
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_pipeline_model_parallel_group()
torch.distributed.broadcast(done, src, group)
yield tokens, lengths
else:
if mpu.is_pipeline_first_stage():
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
new_tokens = torch.empty_like(tokens[:, context_length])
torch.distributed.broadcast(new_tokens, src, group)
tokens[:, context_length] = new_tokens
yield tokens, None
else:
yield None, None
done = torch.cuda.ByteTensor([0])
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_pipeline_model_parallel_group()
torch.distributed.broadcast(done, src, group)
context_length += 1
counter += 1
if done:
break
......@@ -266,7 +266,7 @@ class _GPT2BPETokenizer(AbstractTokenizer):
super().__init__(name)
if special_tokens is not None:
# for controllable dialog, special_tokens: "[SEP],[CTRL],[PAD]"
# special_tokens: "[SEP],[PAD]"
special_tokens = special_tokens.split(",")
else:
special_tokens = []
......@@ -277,8 +277,6 @@ class _GPT2BPETokenizer(AbstractTokenizer):
if special_tokens is not None and len(special_tokens) > 0:
if "[SEP]" in special_tokens:
self.sep_id = self.tokenizer.special_tokens['[SEP]']
if "[CTRL]" in special_tokens:
self.ctrl_id = self.tokenizer.special_tokens['[CTRL]']
if "[PAD]" in special_tokens:
self.pad_id = self.tokenizer.special_tokens['[PAD]']
......
......@@ -26,6 +26,7 @@ import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron import get_args
from megatron import get_signal_handler
from megatron import get_timers
from megatron import get_tensorboard_writer
from megatron import get_current_global_batch_size
......@@ -38,6 +39,7 @@ from megatron import print_rank_last
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from megatron.model import Float16Module
from megatron.model import ModelType
from megatron.optimizer import get_megatron_optimizer
from megatron.initialize import initialize_megatron
from megatron.initialize import write_args_to_tensorboard
......@@ -47,9 +49,7 @@ from megatron.utils import check_adlr_autoresume_termination
from megatron.utils import unwrap_model
from megatron.data.data_samplers import build_pretraining_data_loader
from megatron.utils import calc_params_l2_norm
from megatron.schedules import forward_backward_no_pipelining
from megatron.schedules import forward_backward_pipelining_without_interleaving
from megatron.schedules import forward_backward_pipelining_with_interleaving
from megatron.schedules import get_forward_backward_func
from megatron.utils import report_memory
......@@ -62,6 +62,7 @@ def print_datetime(string):
def pretrain(train_valid_test_dataset_provider,
model_provider,
model_type,
forward_step_func,
extra_args_provider=None,
args_defaults={}):
......@@ -78,6 +79,7 @@ def pretrain(train_valid_test_dataset_provider,
train/valid/test dataset and returns `train, valid, test` datasets.
model_provider: a function that returns a vanilla version of the
model. By vanilla we mean a simple model on cpu with no fp16 or ddp.
model_type: an enum that specifies the type of model being trained.
forward_step_func: a function that takes a `data iterator` and `model`,
and returns a `loss` scalar with a dictionary with key:values being
the info we would like to monitor during training, for example
......@@ -97,7 +99,7 @@ def pretrain(train_valid_test_dataset_provider,
# This will be closer to what scheduler will see (outside of
# image ... launches.
global _TRAIN_START_TIME
start_time_tensor = torch.cuda.FloatTensor([_TRAIN_START_TIME])
start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME])
torch.distributed.all_reduce(start_time_tensor,
op=torch.distributed.ReduceOp.MIN)
_TRAIN_START_TIME = start_time_tensor.item()
......@@ -110,7 +112,8 @@ def pretrain(train_valid_test_dataset_provider,
# Model, optimizer, and learning rate.
timers('model-and-optimizer-setup').start()
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider,
model_type)
timers('model-and-optimizer-setup').stop()
print_datetime('after model, optimizer, and learning rate '
'scheduler are built')
......@@ -141,15 +144,15 @@ def pretrain(train_valid_test_dataset_provider,
# if not args.run_dialog:
if args.do_train and args.train_iters > 0:
iteration = train(forward_step_func,
model, optimizer, lr_scheduler,
train_data_iterator, valid_data_iterator)
model, optimizer, lr_scheduler,
train_data_iterator, valid_data_iterator)
print_datetime('after training is done')
if args.do_valid:
prefix = 'the end of training for val data'
evaluate_and_print_results(prefix, forward_step_func,
valid_data_iterator, model,
iteration, False)
valid_data_iterator, model,
iteration, False)
if args.save and iteration != 0:
save_checkpoint(iteration, model, optimizer, lr_scheduler)
......@@ -158,41 +161,9 @@ def pretrain(train_valid_test_dataset_provider,
# Run on test data.
prefix = 'the end of training for test data'
evaluate_and_print_results(prefix, forward_step_func,
test_data_iterator, model,
0, True)
# else:
# # training for dialog/control model
# timers('interval-time').start() # start timers('interval-time') here to avoid it from starting multiple times
# for e in range(args.num_epoch):
# print_rank_0('> training on epoch %d' % (e+1))
# if args.do_train and args.train_iters > 0:
# iteration += train(forward_step_func,
# model, optimizer, lr_scheduler,
# train_data_iterator, valid_data_iterator)
# print_datetime('after training is done')
# if args.do_valid:
# prefix = 'the end of training for val data'
# evaluate_and_print_results(prefix, forward_step_func,
# valid_data_iterator, model,
# iteration, False)
# # if args.train_module == "dialog":
# # if (e+1) >= 6 and (e+1) <= 15 and args.save and iteration != 0:
# # save_checkpoint(iteration, model, optimizer, lr_scheduler)
# if args.train_module == "control":
# if (e+1) >= 5 and (e+1) <= 9 and args.save and iteration != 0:
# save_checkpoint(iteration, model, optimizer, lr_scheduler)
# if args.do_test:
# # Run on test data.
# prefix = 'the end of training for test data'
# evaluate_and_print_results(prefix, forward_step_func,
# test_data_iterator, model,
# 0, True)
test_data_iterator, model,
0, True)
def update_train_iters(args):
# For iteration-based training, we don't need to do anything
......@@ -223,13 +194,16 @@ def update_train_iters(args):
print_rank_0('setting training iterations to {}'.format(args.train_iters))
def get_model(model_provider_func):
def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True):
"""Build the model."""
args = get_args()
args.model_type = model_type
# Build model.
if mpu.get_pipeline_model_parallel_world_size() > 1 and \
args.virtual_pipeline_model_parallel_size is not None:
assert model_type != ModelType.encoder_and_decoder, \
"Interleaved schedule not supported for model with both encoder and decoder"
model = []
for i in range(args.virtual_pipeline_model_parallel_size):
mpu.set_virtual_pipeline_model_parallel_rank(i)
......@@ -240,14 +214,36 @@ def get_model(model_provider_func):
pre_process=pre_process,
post_process=post_process
)
this_model.model_type = model_type
model.append(this_model)
else:
pre_process = mpu.is_pipeline_first_stage()
post_process = mpu.is_pipeline_last_stage()
model = model_provider_func(
pre_process=pre_process,
post_process=post_process
)
add_encoder = True
add_decoder = True
if model_type == ModelType.encoder_and_decoder:
if mpu.get_pipeline_model_parallel_world_size() > 1:
assert args.pipeline_model_parallel_split_rank is not None, \
"Split rank needs to be specified for model with both encoder and decoder"
rank = mpu.get_pipeline_model_parallel_rank()
split_rank = args.pipeline_model_parallel_split_rank
world_size = mpu.get_pipeline_model_parallel_world_size()
pre_process = rank == 0 or rank == split_rank
post_process = (rank == (split_rank - 1)) or (
rank == (world_size - 1))
add_encoder = mpu.is_pipeline_stage_before_split()
add_decoder = mpu.is_pipeline_stage_after_split()
model = model_provider_func(
pre_process=pre_process,
post_process=post_process,
add_encoder=add_encoder,
add_decoder=add_decoder)
else:
model = model_provider_func(
pre_process=pre_process,
post_process=post_process
)
model.model_type = model_type
if not isinstance(model, list):
model = [model]
......@@ -277,22 +273,24 @@ def get_model(model_provider_func):
if args.fp16 or args.bf16:
model = [Float16Module(model_module, args) for model_module in model]
if args.DDP_impl == 'torch':
i = torch.cuda.current_device()
model = [torchDDP(model_module, device_ids=[i], output_device=i,
process_group=mpu.get_data_parallel_group())
for model_module in model]
return model
if wrap_with_ddp:
if args.DDP_impl == 'torch':
i = torch.cuda.current_device()
model = [torchDDP(model_module, device_ids=[i], output_device=i,
process_group=mpu.get_data_parallel_group())
for model_module in model]
if args.DDP_impl == 'local':
model = [LocalDDP(model_module,
args.accumulate_allreduce_grads_in_fp32,
args.use_contiguous_buffers_in_ddp)
for model_module in model]
return model
elif args.DDP_impl == 'local':
model = [LocalDDP(model_module,
args.accumulate_allreduce_grads_in_fp32,
args.use_contiguous_buffers_in_local_ddp)
for model_module in model]
raise NotImplementedError('Unknown DDP implementation specified: {}. '
'Exiting.'.format(args.DDP_impl))
else:
raise NotImplementedError('Unknown DDP implementation specified: '
'{}. Exiting.'.format(args.DDP_impl))
return model
def get_learning_rate_scheduler(optimizer):
......@@ -338,11 +336,11 @@ def get_learning_rate_scheduler(optimizer):
return lr_scheduler
def setup_model_and_optimizer(model_provider_func):
def setup_model_and_optimizer(model_provider_func, model_type):
"""Setup model and optimizer."""
args = get_args()
model = get_model(model_provider_func)
model = get_model(model_provider_func, model_type)
unwrapped_model = unwrap_model(model,
(torchDDP, LocalDDP, Float16Module))
......@@ -387,26 +385,20 @@ def train_step(forward_step_func, data_iterator,
timers = get_timers()
# Set grad to zero.
if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_ddp:
if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_local_ddp:
for partition in model:
partition.zero_grad_buffer()
else:
optimizer.zero_grad()
if mpu.get_pipeline_model_parallel_world_size() > 1:
if args.virtual_pipeline_model_parallel_size is not None:
forward_backward_func = forward_backward_pipelining_with_interleaving
assert get_num_microbatches() % args.pipeline_model_parallel_size == 0, \
'number of microbatches is not divisible by pipeline-parallel ' \
'size when using interleaved schedule'
else:
forward_backward_func = forward_backward_pipelining_without_interleaving
else:
forward_backward_func = forward_backward_no_pipelining
optimizer.zero_grad()
forward_backward_func = get_forward_backward_func()
losses_reduced = forward_backward_func(
forward_step_func, data_iterator, model,
optimizer, timers, forward_only=False)
# Empty unused memory
if args.empty_unused_memory_level >= 1:
torch.cuda.empty_cache()
# All-reduce if needed.
if args.DDP_impl == 'local':
timers('backward-params-all-reduce').start()
......@@ -419,13 +411,14 @@ def train_step(forward_step_func, data_iterator,
# This should only run for models that support pipelined model parallelism
# (BERT and GPT-2).
timers('backward-embedding-all-reduce').start()
if (mpu.is_pipeline_first_stage(ignore_virtual=True) or
mpu.is_pipeline_last_stage(ignore_virtual=True)) and \
if mpu.is_rank_in_embedding_group(ignore_virtual=True) and \
mpu.get_pipeline_model_parallel_world_size() > 1:
if mpu.is_pipeline_first_stage(ignore_virtual=True):
unwrapped_model = model[0]
elif mpu.is_pipeline_last_stage(ignore_virtual=True):
unwrapped_model = model[-1]
else: # We do not support the interleaved schedule for T5 yet.
unwrapped_model = model[0]
unwrapped_model = unwrap_model(
unwrapped_model, (torchDDP, LocalDDP, Float16Module))
......@@ -453,6 +446,10 @@ def train_step(forward_step_func, data_iterator,
else:
skipped_iter = 1
# Empty unused memory
if args.empty_unused_memory_level >= 2:
torch.cuda.empty_cache()
if mpu.is_pipeline_last_stage(ignore_virtual=True):
# Average loss across microbatches.
loss_reduced = {}
......@@ -550,6 +547,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
writer.add_scalar('loss-scale', loss_scale, iteration)
writer.add_scalar('loss-scale vs samples', loss_scale,
args.consumed_train_samples)
if args.log_world_size_to_tensorboard:
writer.add_scalar('world-size', args.world_size, iteration)
writer.add_scalar('world-size vs samples', args.world_size,
args.consumed_train_samples)
if grad_norm is not None:
writer.add_scalar('grad-norm', grad_norm, iteration)
writer.add_scalar('grad-norm vs samples', grad_norm,
......@@ -565,11 +566,28 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
if args.log_timers_to_tensorboard:
timers.write(timers_to_log, writer, iteration,
normalizer=total_iterations)
if args.log_memory_to_tensorboard:
mem_stats = torch.cuda.memory_stats()
writer.add_scalar(
"mem-reserved-bytes",
mem_stats["reserved_bytes.all.current"],
iteration,
)
writer.add_scalar(
"mem-allocated-bytes",
mem_stats["allocated_bytes.all.current"],
iteration,
)
writer.add_scalar(
"mem-allocated-count",
mem_stats["allocation.all.current"],
iteration,
)
if iteration % args.log_interval == 0:
elapsed_time = timers('interval-time').elapsed()
elapsed_time_per_iteration = elapsed_time / total_iterations
if writer and torch.distributed.get_rank() == 0:
if writer:
if args.log_timers_to_tensorboard:
writer.add_scalar('iteration-time',
elapsed_time_per_iteration, iteration)
......@@ -689,6 +707,14 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
# Checkpointing
saved_checkpoint = False
if args.exit_signal_handler:
signal_handler = get_signal_handler()
if any(signal_handler.signals_received()):
save_checkpoint_and_time(iteration, model, optimizer,
lr_scheduler)
print_datetime('exiting program after receiving SIGTERM.')
sys.exit()
if args.save and args.save_interval and \
iteration % args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer,
......@@ -741,17 +767,15 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
print_rank_0('Evaluating iter {}/{}'.format(iteration,
args.eval_iters))
if mpu.get_pipeline_model_parallel_world_size() > 1:
if args.virtual_pipeline_model_parallel_size is not None:
forward_backward_func = forward_backward_pipelining_with_interleaving
else:
forward_backward_func = forward_backward_pipelining_without_interleaving
else:
forward_backward_func = forward_backward_no_pipelining
forward_backward_func = get_forward_backward_func()
loss_dicts = forward_backward_func(
forward_step_func, data_iterator, model, optimizer=None,
timers=None, forward_only=True)
# Empty unused memory
if args.empty_unused_memory_level >= 1:
torch.cuda.empty_cache()
if mpu.is_pipeline_last_stage(ignore_virtual=True):
# Reduce across processes.
for loss_dict in loss_dicts:
......@@ -784,7 +808,7 @@ def evaluate_and_print_results(prefix, forward_step_func,
string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item())
ppl = math.exp(min(20, total_loss_dict[key].item()))
string += '{} PPL: {:.6E} | '.format(key, ppl)
if writer and is_last_rank():
if writer:
writer.add_scalar('{} validation'.format(key),
total_loss_dict[key].item(),
iteration)
......@@ -823,48 +847,24 @@ def build_train_valid_test_data_iterators(
            'only backward compatibility support for iteration-based training'
args.consumed_train_samples = args.iteration * args.global_batch_size
if args.iteration > 0 and args.consumed_valid_samples == 0:
assert args.train_samples is None, \
            'only backward compatibility support for iteration-based training'
args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
args.eval_iters * args.global_batch_size
# if args.run_dialog:
# args.consumed_train_samples = 0
# args.consumed_valid_samples = 0
# args.iteration = 0
if args.train_samples is None:
args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
args.eval_iters * args.global_batch_size
# Data loader only on rank 0 of each model parallel group.
if mpu.get_tensor_model_parallel_rank() == 0:
# if args.run_dialog:
# # Build the datasets.
# train_ds, valid_ds, test_ds = build_train_valid_test_datasets_provider()
# print_rank_0(' > datasets target sizes:')
# train_size = len(train_ds)
# valid_size = len(valid_ds)
# test_size = len(test_ds)
# print_rank_0(' train: {}'.format(train_size))
# print_rank_0(' validation: {}'.format(valid_size))
# print_rank_0(' test: {}'.format(test_size))
# batch_size = args.global_batch_size
# args.train_iters = train_size // batch_size + 1
# args.eval_iters = valid_size // batch_size + 1
# args.test_iters = test_size // batch_size + 1
# else:
# Number of train/valid/test samples.
if args.train_samples:
train_samples = args.train_samples
else:
train_samples = args.train_iters * args.global_batch_size
eval_iters = (args.train_iters // args.eval_interval + 1) * \
args.eval_iters
args.eval_iters
test_iters = args.eval_iters
train_val_test_num_samples = [train_samples,
eval_iters * args.global_batch_size,
test_iters * args.global_batch_size]
eval_iters * args.global_batch_size,
test_iters * args.global_batch_size]
print_rank_0(' > datasets target sizes (minimum size):')
print_rank_0(' train: {}'.format(train_val_test_num_samples[0]))
print_rank_0(' validation: {}'.format(train_val_test_num_samples[1]))
......
......@@ -25,7 +25,7 @@ from megatron import print_rank_0
from megatron import get_timers
from megatron import mpu
from megatron.data.dataset_utils import build_train_valid_test_datasets
from megatron.model import BertModel
from megatron.model import BertModel, ModelType
from megatron.training import pretrain
from megatron.utils import average_losses_across_data_parallel_group
......@@ -143,5 +143,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
if __name__ == "__main__":
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
pretrain(train_valid_test_datasets_provider, model_provider,
ModelType.encoder_or_decoder,
forward_step, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
......@@ -23,7 +23,7 @@ from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron.data.gpt_dataset import build_train_valid_test_datasets
from megatron.model import GPTModel
from megatron.model import GPTModel, ModelType
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import average_losses_across_data_parallel_group
......@@ -121,5 +121,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
if __name__ == "__main__":
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
pretrain(train_valid_test_datasets_provider, model_provider,
ModelType.encoder_or_decoder,
forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
......@@ -28,6 +28,7 @@ from megatron import get_timers
from megatron import mpu
from megatron.data.biencoder_dataset_utils import get_ict_batch
from megatron.data.dataset_utils import build_train_valid_test_datasets
from megatron.model import ModelType
from megatron.model.biencoder_model import biencoder_model_provider
from megatron.training import pretrain
from megatron.utils import average_losses_across_data_parallel_group
......@@ -174,5 +175,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
if __name__ == "__main__":
pretrain(train_valid_test_datasets_provider,
pretrain_ict_model_provider,
ModelType.encoder_or_decoder,
forward_step,
args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
......@@ -26,18 +26,58 @@ from megatron import (
print_rank_0
)
from megatron.data.dataset_utils import build_train_valid_test_datasets
from megatron.model import T5Model
from megatron.model import T5Model, ModelType
from megatron.training import pretrain
from megatron.utils import average_losses_across_data_parallel_group
def model_provider(pre_process=True, post_process=True):
"""
Pipeline parallelism for T5
===========================
T5 is a model architecture with both encoder and decoder blocks.
Consequently, pipeline parallelism is implemented slightly differently
compared to architectures like GPT and BERT.
In particular, when pipeline_model_parallel_world_size > 1, each stage
either executes an encoder block or a decoder block. The
--pipeline-model-parallel-split-rank argument controls the rank at which
the split happens: all ranks lower than this argument execute the
encoder block, and all ranks equal to or higher than this argument value
execute the decoder block.
In the encoder section of the model, only one tensor is sent downstream:
the intermediate encoder_hidden_state. In the decoder section of the
model, two tensors are sent downstream in the forward pass: the fully
computed encoder_hidden_state, and the intermediate decoder_hidden_state.
In particular, these are the shapes of the tensors sent between
different workers:
If rank is in decoder section:
intermediate decoder_hidden_state (pre-transpose),
complete encoder_hidden_state (post-transpose).
If rank is at boundary between encoder and decoder sections:
complete encoder_hidden_state (post-transpose).
If rank is in encoder section:
intermediate encoder_hidden_state (pre-transpose).
Additionally, we have code in the backward_step function in schedules.py
to accumulate the encoder_hidden_state gradient across skip connections
(encoder_hidden_state fed in as input to each layer in the decoder).
"""
def model_provider(pre_process=True, post_process=True,
add_encoder=True, add_decoder=True):
"""Build the model."""
assert pre_process and post_process, "T5 doesn't yet support pipelining"
print_rank_0('building T5 model ...')
model = T5Model(num_tokentypes=0,
parallel_output=True)
parallel_output=True,
pre_process=pre_process,
post_process=post_process,
add_encoder=add_encoder,
add_decoder=add_decoder)
return model
......@@ -70,9 +110,7 @@ def get_batch(data_iterator):
def loss_func(loss_mask, output_tensor):
lm_loss_, _ = output_tensor
lm_loss_ = lm_loss_.float()
lm_loss_ = output_tensor.float()
lm_loss = torch.sum(
lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
......@@ -130,5 +168,5 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
if __name__ == "__main__":
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
pretrain(train_valid_test_datasets_provider, model_provider, ModelType.encoder_and_decoder,
forward_step, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
......@@ -20,6 +20,7 @@ import torch.nn.functional as F
from functools import partial
from megatron import get_args, get_timers, mpu, print_rank_0
from megatron.data.vit_dataset import build_train_valid_datasets
from megatron.model import ModelType
from megatron.model.vit_model import VitModel
from megatron.training import pretrain
from megatron.utils import average_losses_across_data_parallel_group
......@@ -92,6 +93,7 @@ if __name__ == "__main__":
pretrain(
train_valid_test_datasets_provider,
model_provider,
ModelType.encoder_or_decoder,
forward_step,
args_defaults={'dataloader_type': 'cyclic'}
)