Unverified commit a7f882fe authored by Satpal Singh Rathore, committed by GitHub

Merge branch 'main' into main

parents 8f241a96 9d86ca67
@@ -187,6 +187,10 @@ class Float16Module(MegatronModule):
         self.float16_convertor = float16_convertor

+    def set_input_tensor(self, input_tensor):
+        return self.module.set_input_tensor(input_tensor)
+
     def forward(self, *inputs, **kwargs):
         if mpu.is_pipeline_first_stage():
             inputs = fp32_to_float16(inputs, self.float16_convertor)
...
@@ -179,11 +179,6 @@ class ParallelAttention(MegatronModule):
             init_method=output_layer_init_method,
             skip_bias_add=True)

-        # Inference key-value memory
-        self.inference_key_memory = None
-        self.inference_value_memory = None
-        self.inference_current_sequence_len = 0
-
     def _allocate_memory(self, inference_max_sequence_len, batch_size):
         return torch.empty(
@@ -196,35 +191,27 @@ class ParallelAttention(MegatronModule):
     def forward(self, hidden_states, attention_mask,
-                encoder_output=None,
-                set_inference_key_value_memory=False,
-                inference_max_sequence_len=None):
+                encoder_output=None, inference_params=None):
         # hidden_states: [sq, b, h]

         # =================================================
         # Pre-allocate memory for key-values for inference.
         # =================================================
-        if set_inference_key_value_memory:
-            assert inference_max_sequence_len and inference_max_sequence_len > 0
-            self.inference_key_memory = self._allocate_memory(
-                inference_max_sequence_len, hidden_states.size(1))
-            self.inference_value_memory = self._allocate_memory(
-                inference_max_sequence_len, hidden_states.size(1))
-            self.inference_current_sequence_len = 0
-        # Some consistency check.
-        if inference_max_sequence_len:
-            assert self.inference_current_sequence_len < \
-                self.inference_key_memory.size(0)
-            assert inference_max_sequence_len == \
-                self.inference_key_memory.size(0)
-        # This is added for safety. In case inference_max_sequence_len
-        # is not provided, make sure there is no potential memory left
-        # from previous inference.
-        if not inference_max_sequence_len:
-            self.inference_key_memory = None
-            self.inference_value_memory = None
+        if inference_params:
+            if self.layer_number not in inference_params.key_value_memory_dict:
+                inf_max_seq_len = inference_params.max_sequence_len
+                inf_max_batch_size = inference_params.max_batch_size
+                inference_key_memory = self._allocate_memory(
+                    inf_max_seq_len, inf_max_batch_size)
+                inference_value_memory = self._allocate_memory(
+                    inf_max_seq_len, inf_max_batch_size)
+                inference_params.key_value_memory_dict[self.layer_number] = (
+                    inference_key_memory, inference_value_memory)
+            else:
+                inference_key_memory, inference_value_memory = \
+                    inference_params.key_value_memory_dict[self.layer_number]

         # =====================
         # Query, Key, and Value
@@ -267,22 +254,26 @@ class ParallelAttention(MegatronModule):
         query_layer = query_layer.view(*new_tensor_shape)

-        # ===================================================
-        # Adjust key, value, and attention mask for inference
-        # ===================================================
-        if inference_max_sequence_len:
-            # Adjust the range variables.
-            start = self.inference_current_sequence_len
-            self.inference_current_sequence_len += key_layer.size(0)
-            end = self.inference_current_sequence_len
+        # ==================================
+        # Adjust key and value for inference
+        # ==================================
+        if inference_params:
+            batch_start = inference_params.batch_size_offset
+            batch_end = batch_start + key_layer.size(1)
+            assert batch_end <= inference_key_memory.size(1)
+            sequence_start = inference_params.sequence_len_offset
+            sequence_end = sequence_start + key_layer.size(0)
+            assert sequence_end <= inference_key_memory.size(0)
             # Copy key and values.
-            self.inference_key_memory[start:end, ...] = key_layer
-            self.inference_value_memory[start:end, ...] = value_layer
-            key_layer = self.inference_key_memory[:end, ...]
-            value_layer = self.inference_value_memory[:end, ...]
-            # Adjust attention mask
-            attention_mask = attention_mask[..., start:end, :end]
+            inference_key_memory[sequence_start:sequence_end,
+                                 batch_start:batch_end, ...] = key_layer
+            inference_value_memory[sequence_start:sequence_end,
+                                   batch_start:batch_end, ...] = value_layer
+            key_layer = inference_key_memory[
+                :sequence_end, batch_start:batch_end, ...]
+            value_layer = inference_value_memory[
+                :sequence_end, batch_start:batch_end, ...]

         # ===================================
@@ -432,7 +423,8 @@ class ParallelTransformerLayer(MegatronModule):
         # Layernorm on the input data.
         self.input_layernorm = LayerNorm(
             args.hidden_size,
-            eps=args.layernorm_epsilon)
+            eps=args.layernorm_epsilon,
+            no_persist_layer_norm=args.no_persist_layer_norm)

         # Self attention.
         self.self_attention = ParallelAttention(
@@ -447,7 +439,8 @@ class ParallelTransformerLayer(MegatronModule):
         # Layernorm on the attention output
         self.post_attention_layernorm = LayerNorm(
             args.hidden_size,
-            eps=args.layernorm_epsilon)
+            eps=args.layernorm_epsilon,
+            no_persist_layer_norm=args.no_persist_layer_norm)

         if self.layer_type == LayerType.decoder:
             self.inter_attention = ParallelAttention(
@@ -458,17 +451,16 @@ class ParallelTransformerLayer(MegatronModule):
             # Layernorm on the attention output.
             self.post_inter_attention_layernorm = LayerNorm(
                 args.hidden_size,
-                eps=args.layernorm_epsilon)
+                eps=args.layernorm_epsilon,
+                no_persist_layer_norm=args.no_persist_layer_norm)

         # MLP
         self.mlp = ParallelMLP(init_method,
                                output_layer_init_method)

     def forward(self, hidden_states, attention_mask,
-                encoder_output=None,
-                enc_dec_attn_mask=None,
-                set_inference_key_value_memory=False,
-                inference_max_sequence_len=None):
+                encoder_output=None, enc_dec_attn_mask=None,
+                inference_params=None):
         # hidden_states: [b, s, h]

         # Layer norm at the beginning of the transformer layer.
@@ -478,8 +470,7 @@ class ParallelTransformerLayer(MegatronModule):
             self.self_attention(
                 layernorm_output,
                 attention_mask,
-                set_inference_key_value_memory=set_inference_key_value_memory,
-                inference_max_sequence_len=inference_max_sequence_len)
+                inference_params=inference_params)

         # Residual connection.
         if self.apply_residual_connection_post_layernorm:
@@ -589,6 +580,7 @@ class ParallelTransformer(MegatronModule):
             assert args.num_layers % args.virtual_pipeline_model_parallel_size == 0, \
                 'num_layers_per_stage must be divisible by ' \
                 'virtual_pipeline_model_parallel_size'
+            assert args.model_type != ModelType.encoder_and_decoder
             # Number of layers in each model chunk is the number of layers in the stage,
             # divided by the number of model chunks in a stage.
             self.num_layers = self.num_layers // args.virtual_pipeline_model_parallel_size
@@ -605,7 +597,16 @@ class ParallelTransformer(MegatronModule):
                 (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
         else:
             # Each stage gets a contiguous set of layers.
-            offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers
+            if args.model_type == ModelType.encoder_and_decoder and \
+                    mpu.get_pipeline_model_parallel_world_size() > 1:
+                pipeline_rank = mpu.get_pipeline_model_parallel_rank()
+                if layer_type == LayerType.encoder:
+                    offset = pipeline_rank * self.num_layers
+                else:
+                    num_ranks_in_enc = args.pipeline_model_parallel_split_rank
+                    offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers
+            else:
+                offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers

         self.layers = torch.nn.ModuleList(
             [build_layer(i + 1 + offset) for i in range(self.num_layers)])
@@ -614,7 +615,8 @@ class ParallelTransformer(MegatronModule):
             # Final layer norm before output.
             self.final_layernorm = LayerNorm(
                 args.hidden_size,
-                eps=args.layernorm_epsilon)
+                eps=args.layernorm_epsilon,
+                no_persist_layer_norm=args.no_persist_layer_norm)

     def _get_layer(self, layer_number):
         return self.layers[layer_number]
@@ -634,23 +636,6 @@ class ParallelTransformer(MegatronModule):
                 return x_
             return custom_forward

-        def distribute_checkpointed_activations_helper(layer_number):
-            """Distribute checkpointed activations across the tensor model
-            parallel ranks if the `distribute-checkpointed-activations`
-            flag is on and either of the following conditions is met:
-              - it is not the first layer in the pipeline stage.
-                The first layer is used in the pipeline parallelism
-                and changing its shape throws an error in the backward pass.
-              - we are at the first pipeline stage, so the input tensor is
-                not used in pipeline parallelism. Note that no pipeline
-                parallelism is a special case of this.
-            """
-            not_first_layer_in_pipeline_stage = (layer_number > 0)
-            is_first_pipeline_stage = (
-                mpu.get_pipeline_model_parallel_rank() == 0)
-            return self.distribute_checkpointed_activations and \
-                (not_first_layer_in_pipeline_stage or is_first_pipeline_stage)
-
         if self.activations_checkpoint_method == 'uniform':
             # Uniformly divide the total number of Transformer layers and checkpoint
             # the input activation of each divided chunk.
@@ -659,7 +644,7 @@ class ParallelTransformer(MegatronModule):
             while l < self.num_layers:
                 hidden_states = mpu.checkpoint(
                     custom(l, l + self.activations_checkpoint_num_layers),
-                    distribute_checkpointed_activations_helper(l),
+                    self.distribute_checkpointed_activations,
                     hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
                 l += self.activations_checkpoint_num_layers
         elif self.activations_checkpoint_method == 'block':
@@ -670,7 +655,7 @@ class ParallelTransformer(MegatronModule):
                 if l < self.activations_checkpoint_num_layers:
                     hidden_states = mpu.checkpoint(
                         custom(l, l + 1),
-                        distribute_checkpointed_activations_helper(l),
+                        self.distribute_checkpointed_activations,
                         hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
                 else:
                     hidden_states = custom(l, l + 1)(
@@ -691,13 +676,11 @@ class ParallelTransformer(MegatronModule):
         self.input_tensor = input_tensor

     def forward(self, hidden_states, attention_mask,
-                encoder_output=None,
-                enc_dec_attn_mask=None,
-                set_inference_key_value_memory=False,
-                inference_max_sequence_len=None):
+                encoder_output=None, enc_dec_attn_mask=None,
+                inference_params=None):

         # Checks.
-        if inference_max_sequence_len:
+        if inference_params:
             assert self.activations_checkpoint_method is None, \
                 'inference does not work with activation checkpointing'
@@ -729,8 +712,8 @@ class ParallelTransformer(MegatronModule):
                     attention_mask,
                     encoder_output=encoder_output,
                     enc_dec_attn_mask=enc_dec_attn_mask,
-                    set_inference_key_value_memory=set_inference_key_value_memory,
-                    inference_max_sequence_len=inference_max_sequence_len)
+                    inference_params=inference_params)

         # Final layer norm.
         if self.post_process:
...
@@ -25,6 +25,7 @@ from .initialize import get_data_parallel_group
 from .initialize import get_data_parallel_rank
 from .initialize import get_data_parallel_world_size
 from .initialize import get_embedding_group
+from .initialize import get_position_embedding_group
 from .initialize import get_model_parallel_group
 from .initialize import get_tensor_model_parallel_group
 from .initialize import get_pipeline_model_parallel_group
@@ -32,6 +33,7 @@ from .initialize import get_tensor_model_parallel_rank, set_tensor_model_paralle
 from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank
 from .initialize import is_pipeline_first_stage, is_pipeline_last_stage
 from .initialize import is_rank_in_embedding_group
+from .initialize import is_rank_in_position_embedding_group
 from .initialize import is_pipeline_stage_before_split, is_pipeline_stage_after_split
 from .initialize import is_pipeline_stage_at_split
 from .initialize import get_num_layers
...
@@ -29,6 +29,8 @@ _PIPELINE_MODEL_PARALLEL_GROUP = None
 _MODEL_PARALLEL_GROUP = None
 # Embedding group.
 _EMBEDDING_GROUP = None
+# Position embedding group.
+_POSITION_EMBEDDING_GROUP = None
 # Data parallel group that the current rank belongs to.
 _DATA_PARALLEL_GROUP = None
@@ -45,6 +47,9 @@ _MPU_PIPELINE_MODEL_PARALLEL_RANK = None
 # A list of ranks that have a copy of the embedding.
 _EMBEDDING_GLOBAL_RANKS = None

+# A list of ranks that have a copy of the position embedding.
+_POSITION_EMBEDDING_GLOBAL_RANKS = None
+
 # A list of global ranks for each pipeline group to ease calculation of the source
 # rank when broadcasting from the first or last pipeline stage.
 _PIPELINE_GLOBAL_RANKS = None
@@ -165,6 +170,10 @@ def initialize_model_parallel(tensor_model_parallel_size_=1,
     global _EMBEDDING_GLOBAL_RANKS
     assert _EMBEDDING_GROUP is None, \
         'embedding group is already initialized'
+    global _POSITION_EMBEDDING_GROUP
+    global _POSITION_EMBEDDING_GLOBAL_RANKS
+    assert _POSITION_EMBEDDING_GROUP is None, \
+        'position embedding group is already initialized'
     for i in range(num_pipeline_model_parallel_groups):
         ranks = range(i, world_size,
                       num_pipeline_model_parallel_groups)
@@ -176,19 +185,31 @@ def initialize_model_parallel(tensor_model_parallel_size_=1,
         # first and last stages).
         if len(ranks) > 1:
             embedding_ranks = [ranks[0], ranks[-1]]
-            if pipeline_model_parallel_split_rank_ is not None and \
-                    pipeline_model_parallel_split_rank_ not in embedding_ranks:
-                embedding_ranks = [ranks[0],
-                                   ranks[pipeline_model_parallel_split_rank_],
-                                   ranks[-1]]
+            position_embedding_ranks = [ranks[0]]
+            if pipeline_model_parallel_split_rank_ is not None:
+                if ranks[pipeline_model_parallel_split_rank_] not in embedding_ranks:
+                    embedding_ranks = [ranks[0],
+                                       ranks[pipeline_model_parallel_split_rank_],
+                                       ranks[-1]]
+                if ranks[pipeline_model_parallel_split_rank_] not in position_embedding_ranks:
+                    position_embedding_ranks = [ranks[0],
+                                                ranks[pipeline_model_parallel_split_rank_]]
         else:
             embedding_ranks = ranks
+            position_embedding_ranks = ranks

         group = torch.distributed.new_group(embedding_ranks)
         if rank in embedding_ranks:
             _EMBEDDING_GROUP = group
         if rank in ranks:
             _EMBEDDING_GLOBAL_RANKS = embedding_ranks

+        group = torch.distributed.new_group(position_embedding_ranks)
+        if rank in position_embedding_ranks:
+            _POSITION_EMBEDDING_GROUP = group
+        if rank in ranks:
+            _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks


 def model_parallel_is_initialized():
     """Check if model and data parallel groups are initialized."""
@@ -234,6 +255,13 @@ def get_embedding_group():
     return _EMBEDDING_GROUP


+def get_position_embedding_group():
+    """Get the position embedding group the caller rank belongs to."""
+    assert _POSITION_EMBEDDING_GROUP is not None, \
+        'position embedding group is not initialized'
+    return _POSITION_EMBEDDING_GROUP
+
+
 def set_tensor_model_parallel_world_size(world_size):
     """Set the tensor model parallel size"""
     global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
@@ -352,6 +380,13 @@ def is_rank_in_embedding_group(ignore_virtual=False):
     return False


+def is_rank_in_position_embedding_group():
+    """Return True if the current rank is in the position embedding group, False otherwise."""
+    rank = torch.distributed.get_rank()
+    global _POSITION_EMBEDDING_GLOBAL_RANKS
+    return rank in _POSITION_EMBEDDING_GLOBAL_RANKS
+
+
 def is_pipeline_stage_before_split(rank=None):
     """Return True if pipeline stage executes encoder block for a model
     with both encoder and decoder."""
@@ -467,3 +502,5 @@ def destroy_model_parallel():
     _DATA_PARALLEL_GROUP = None
     global _EMBEDDING_GROUP
     _EMBEDDING_GROUP = None
+    global _POSITION_EMBEDDING_GROUP
+    _POSITION_EMBEDDING_GROUP = None
@@ -24,7 +24,6 @@ from torch import _C
 from torch.cuda import _lazy_call, device as device_ctx_manager
 from torch.utils.checkpoint import detach_variable

-from megatron import get_args
 from megatron.memory import allocate_mem_buff

 from .initialize import get_data_parallel_rank
...
@@ -15,6 +15,7 @@
 from contextlib import contextmanager

 import torch
+from torch.autograd.variable import Variable
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP

 from megatron import get_args
@@ -27,7 +28,6 @@ from megatron.model import DistributedDataParallel as LocalDDP
 from megatron.model import Float16Module
 from megatron.model import ModelType

 def get_forward_backward_func():
     args = get_args()
     if mpu.get_pipeline_model_parallel_world_size() > 1:
@@ -42,6 +42,54 @@ def get_forward_backward_func():
     forward_backward_func = forward_backward_no_pipelining
     return forward_backward_func
def free_output_tensor(output_tensors):
'''Pseudo-free (i.e., set to scalar) the output tensor's '.data' field.
This method should be called right after the output tensor has been
sent to the next pipeline stage. At this point, the output tensor is
only useful for its '.grad_fn' field, and not its '.data'.
'''
if output_tensors is None:
return
if isinstance(output_tensors, torch.Tensor):
output_tensors = [output_tensors]
for output_tensor in output_tensors:
output_tensor.data = torch.cuda.FloatTensor([0])
def custom_backward(output, grad_output):
'''Directly call C++ autograd engine.
To make the 'free_output_tensor' (above) optimization work, the C++
autograd engine must be called directly, bypassing Pytorch's
torch.autograd.backward. Pytorch's 'backward' checks that the output and
grad have the same shape, while C++'s 'backward' does not.
'''
assert output.numel() == 1, \
"output should be pseudo-'freed' in schedule, to optimize memory"
assert isinstance(output, torch.Tensor), \
"output == '%s'." % type(output).__name__
assert isinstance(grad_output, (torch.Tensor, type(None))), \
"grad_output == '%s'." % type(grad_output).__name__
# Handle scalar output
if grad_output is None:
assert output.numel() == 1, "implicit grad requires scalar output."
grad_output = torch.ones_like(
output,
memory_format = torch.preserve_format,
)
# Call c++ engine [ see torch/csrc/autograd/python_engine.cpp ]
Variable._execution_engine.run_backward(
tensors = (output,),
grad_tensors = (grad_output,),
keep_graph = False,
create_graph = False,
inputs = tuple(),
allow_unreachable=True,
accumulate_grad=True,
)
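Editor's note: a minimal, hedged sketch (not part of the commit) of how the two helpers above are meant to be combined: send the stage's output downstream, pseudo-free its '.data', and later drive backward through the retained '.grad_fn' with the gradient received from the next stage. The tensor names are hypothetical and a CUDA device is assumed, since free_output_tensor allocates a cuda placeholder.

import torch

x = torch.randn(4, 8, device='cuda', requires_grad=True)
output_tensor = 2 * x                           # stands in for a pipeline stage's output
grad_output = torch.ones(4, 8, device='cuda')   # normally received from the next stage

# ... output_tensor would be sent to the next stage here ...
free_output_tensor(output_tensor)               # '.data' becomes a 1-element placeholder
custom_backward(output_tensor, grad_output)     # backprop still flows through '.grad_fn'
assert x.grad.shape == (4, 8)                   # gradients reach the leaf as usual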
 def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
     """Forward step for passed-in model.
@@ -116,7 +164,7 @@ def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
     # Backward pass.
     if output_tensor_grad[0] is None:
         output_tensor = optimizer.scale_loss(output_tensor[0])
-    torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0])
+    custom_backward(output_tensor[0], output_tensor_grad[0])

     # Collect the grad of the input_tensor.
     input_tensor_grad = [None]
@@ -324,6 +372,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
                     output_tensor, recv_prev=recv_prev,
                     tensor_shape=tensor_shape,
                     timers=timers)
+            free_output_tensor(output_tensor)
         input_tensors[next_forward_model_chunk_id].append(input_tensor)

     # Run 1F1B in steady state.
@@ -388,6 +437,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
                 output_tensor, input_tensor_grad,
                 recv_prev=recv_prev, recv_next=recv_next,
                 tensor_shape=tensor_shape, timers=timers)
+        free_output_tensor(output_tensor)

         # Put input_tensor and output_tensor_grad in data structures in the
         # right location.
@@ -562,6 +612,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
         if not forward_only:
             input_tensors.append(input_tensor)
             output_tensors.append(output_tensor)
+            free_output_tensor(output_tensor)

     # Before running 1F1B, need to receive first forward tensor.
     # If all microbatches are run in warmup / cooldown phase, then no need to
@@ -590,6 +641,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
             # Add input_tensor and output_tensor to end of list.
             input_tensors.append(input_tensor)
             output_tensors.append(output_tensor)
+            free_output_tensor(output_tensor)

             # Pop input_tensor and output_tensor from the start of the list for
             # the backward pass.
...
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .api import (
generate,
generate_and_post_process)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference API."""
import torch
from megatron import mpu
from .communication import broadcast_float_list
from .generation import (
generate_tokens_probs_and_return_on_first_stage,
score_and_return_on_first_stage)
from .tokenization import (
tokenize_prompts,
detokenize_generations)
def generate_and_post_process(model,
prompts=None,
tokens_to_generate=0,
return_output_log_probs=False,
top_k_sampling=0,
top_p_sampling=0.0,
temperature=1.0,
add_BOS=False,
use_eod_token_for_early_termination=True):
"""Run inference and post-process outputs, i.e., detokenize,
move to cpu and convert to list."""
# Main inference.
tokens, lengths, output_log_probs = generate(
model,
prompts=prompts,
tokens_to_generate=tokens_to_generate,
return_output_log_probs=return_output_log_probs,
top_k_sampling=top_k_sampling,
top_p_sampling=top_p_sampling,
temperature=temperature,
add_BOS=add_BOS,
use_eod_token_for_early_termination=use_eod_token_for_early_termination)
# Only post-process on first stage.
if mpu.is_pipeline_first_stage():
tokens, prompts_plus_generations, prompts_plus_generations_segments = \
detokenize_generations(tokens, lengths, True)
if return_output_log_probs:
output_log_probs = output_log_probs.cpu().numpy().tolist()
for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)):
output_log_probs[i] = prob[:len(seg)-1]
return prompts_plus_generations, prompts_plus_generations_segments, \
output_log_probs, tokens
return None
def generate(model,
prompts=None,
tokens_to_generate=0,
return_output_log_probs=False,
top_k_sampling=0,
top_p_sampling=0.0,
temperature=1.0,
add_BOS=False,
use_eod_token_for_early_termination=True):
"""Given prompts and input parameters, run inference and return:
tokens: prompts plus the generated tokens.
lengths: length of the prompt + generations. Note that we can
discard tokens in the tokens tensor that are after the
corresponding length.
output_log_probs: log probs of the tokens.
"""
    # Make sure input params are available to all ranks.
values = [tokens_to_generate,
return_output_log_probs,
top_k_sampling, top_p_sampling,
temperature, add_BOS, use_eod_token_for_early_termination]
values_float_tensor = broadcast_float_list(7, float_list=values)
tokens_to_generate = int(values_float_tensor[0].item())
return_output_log_probs = bool(values_float_tensor[1].item())
top_k_sampling = int(values_float_tensor[2].item())
top_p_sampling = values_float_tensor[3].item()
temperature = values_float_tensor[4].item()
add_BOS = bool(values_float_tensor[5].item())
use_eod_token_for_early_termination = bool(values_float_tensor[6].item())
# Tokenize prompts and get the batch.
    # Note that these tensors are broadcast to all ranks.
if torch.distributed.get_rank() == 0:
assert prompts is not None
context_tokens_tensor, context_length_tensor = tokenize_prompts(
prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)
if tokens_to_generate == 0:
return score_and_return_on_first_stage(
model, context_tokens_tensor, context_length_tensor)
# Main inference function.
# Note that the outputs are available on the first stage.
return generate_tokens_probs_and_return_on_first_stage(
model, context_tokens_tensor, context_length_tensor,
return_output_log_probs=return_output_log_probs,
top_k=top_k_sampling,
top_p=top_p_sampling,
temperature=temperature,
use_eod_token_for_early_termination=use_eod_token_for_early_termination)
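Editor's note, for orientation only (not part of the commit): every rank enters these calls, but only rank 0 needs to supply prompts, and only the first pipeline stage gets a non-None result back. A hedged sketch, assuming `model` has already been built and model parallelism initialized; the prompt and sampling values are illustrative.

prompts = ["Deep learning is"] if torch.distributed.get_rank() == 0 else None
result = generate_and_post_process(model,
                                   prompts=prompts,
                                   tokens_to_generate=32,
                                   return_output_log_probs=True,
                                   top_p_sampling=0.9,
                                   temperature=0.8)
if result is not None:
    # prompts + generations, per-token segments, log probs, raw token ids
    texts, segments, log_probs, tokens = result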
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Communications utilities."""
import torch
from megatron import mpu
# TODO: use functions from megatron/p2p
def recv_from_prev_pipeline_rank_(recv_buffer=None):
"""Receive from previous pipeline stage and update the
input buffer inplace."""
if not mpu.is_pipeline_first_stage():
assert recv_buffer is not None
recv_prev_op = torch.distributed.P2POp(
torch.distributed.irecv, recv_buffer,
mpu.get_pipeline_model_parallel_prev_rank())
reqs = torch.distributed.batch_isend_irecv([recv_prev_op])
for req in reqs:
req.wait()
# To protect against race condition when using batch_isend_irecv().
torch.cuda.synchronize()
# TODO: use functions from megatron/p2p
def send_to_next_pipeline_rank(tensor=None):
"""Send output to the next pipeline stage."""
if not mpu.is_pipeline_last_stage():
assert tensor is not None
send_next_op = torch.distributed.P2POp(
torch.distributed.isend, tensor,
mpu.get_pipeline_model_parallel_next_rank())
reqs = torch.distributed.batch_isend_irecv([send_next_op])
for req in reqs:
req.wait()
# To protect against race condition when using batch_isend_irecv().
torch.cuda.synchronize()
def _is_cuda(tensor):
"""Check if a tensor is not none and is cuda."""
assert tensor is not None
assert tensor.is_cuda
def _is_cuda_contiguous(tensor):
"""Check if a tensor is not none, is cuda, and is contiguous."""
_is_cuda(tensor)
assert tensor.is_contiguous()
def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
"""Broadcast a tensor from last pipeline stage to all ranks."""
is_last_stage = mpu.is_pipeline_last_stage()
    # If the first stage and the last stage are the same, then there is no
# pipeline parallelism and no need to communicate.
if mpu.is_pipeline_first_stage() and is_last_stage:
return tensor
if is_last_stage:
_is_cuda_contiguous(tensor)
else:
tensor = torch.empty(size,
dtype=dtype,
device=torch.cuda.current_device())
# Get the group and corresponding source rank.
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_pipeline_model_parallel_group()
torch.distributed.broadcast(tensor, src, group)
return tensor
def broadcast_from_last_to_first_pipeline_stage(size, dtype, tensor=None):
"""Broadcast tensor values from last stage into the first stage."""
is_last_stage = mpu.is_pipeline_last_stage()
is_first_stage = mpu.is_pipeline_first_stage()
    # If the first stage and the last stage are the same, then there is no
# pipeline parallelism and no need to communicate.
if is_first_stage and is_last_stage:
return tensor
# Only first and last stage pipeline stages need to be involved.
if is_last_stage or is_first_stage:
if is_last_stage:
_is_cuda_contiguous(tensor)
else:
tensor = torch.empty(size,
dtype=dtype,
device=torch.cuda.current_device())
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
# Broadcast from last stage into the first stage.
torch.distributed.broadcast(tensor, src, group)
else:
tensor = None
return tensor
def copy_from_last_to_first_pipeline_stage(size, dtype, tensor=None):
"""Copy tensor values from last stage into the first stage.
Note that the input tensor is updated in place."""
is_last_stage = mpu.is_pipeline_last_stage()
is_first_stage = mpu.is_pipeline_first_stage()
    # If the first stage and the last stage are the same, then there is no
# pipeline parallelism and no need to communicate.
if is_first_stage and is_last_stage:
return
# Only first and last stage pipeline stages need to be involved.
if is_last_stage or is_first_stage:
_is_cuda(tensor)
is_contiguous = tensor.is_contiguous()
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
if is_contiguous:
tensor_ = tensor
else:
if is_last_stage:
tensor_ = tensor.contiguous()
else:
tensor_ = torch.empty(size,
dtype=dtype,
device=torch.cuda.current_device())
# Broadcast from last stage into the first stage.
torch.distributed.broadcast(tensor_, src, group)
# Update the first stage tensor
if is_first_stage and not is_contiguous:
tensor[...] = tensor_
def broadcast_tensor(size, dtype, tensor=None, rank=0):
""" Given size and type of a tensor on all ranks and the tensor value
only on a specific rank, broadcast from that rank to all other ranks.
"""
if torch.distributed.get_rank() == rank:
_is_cuda_contiguous(tensor)
else:
tensor = torch.empty(size,
dtype=dtype,
device=torch.cuda.current_device())
torch.distributed.broadcast(tensor, rank)
return tensor
def broadcast_list(size, dtype, list_values=None, rank=0):
"""Broadcast a list of values with a given type."""
tensor = None
if torch.distributed.get_rank() == rank:
tensor = torch.tensor(list_values, dtype=dtype,
device=torch.cuda.current_device())
return broadcast_tensor(size, dtype, tensor=tensor, rank=rank)
def broadcast_int_list(size, int_list=None, rank=0):
"""Broadcast a list of interger values."""
return broadcast_list(size, torch.int64, list_values=int_list, rank=rank)
def broadcast_float_list(size, float_list=None, rank=0):
"""Broadcast a list of float values."""
return broadcast_list(size, torch.float32, list_values=float_list,
rank=rank)
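Editor's note: a hedged usage sketch of the helpers above (illustrative values; assumes torch.distributed and CUDA are already initialized). Every rank passes the same size and dtype, while only the source rank supplies the actual values; the unpacked names are made up for the example.

# Rank 0 owns three integer settings; all ranks end up with the same tensor.
values = [7, 13, 42] if torch.distributed.get_rank() == 0 else None
settings = broadcast_int_list(3, int_list=values, rank=0)
tokens_to_generate, top_k, seed = (int(v.item()) for v in settings)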
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Forward step utilities."""
from collections.abc import Iterable
import torch
from megatron import (
get_args,
mpu)
from .communication import (
send_to_next_pipeline_rank,
recv_from_prev_pipeline_rank_)
class InferenceParams:
"""Inference parameters that are passed to the main model in order
    to efficiently calculate and store the context during inference."""
def __init__(self, max_batch_size, max_sequence_len):
"""Note that offsets are set to zero and we always set the
flag to allocate memory. After the first call, make sure to
set this flag to False."""
self.max_sequence_len = max_sequence_len
self.max_batch_size = max_batch_size
self.sequence_len_offset = 0
self.batch_size_offset = 0
self.key_value_memory_dict = {}
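Editor's note: a hedged, self-contained sketch (illustrative shapes only, not Megatron code) of the contract these fields imply, mirroring the ParallelAttention changes earlier in this merge: each layer caches keys/values in a [max_seq_len, max_batch, ...] buffer keyed by its layer number, new entries are written at sequence_len_offset, and the offset is advanced once per forward step.

import torch

params = InferenceParams(max_batch_size=8, max_sequence_len=256)

# Pretend layer 1 caches keys/values with a hidden size of 64.
layer_number, hidden = 1, 64
kv_shape = (params.max_sequence_len, params.max_batch_size, hidden)
params.key_value_memory_dict[layer_number] = (torch.zeros(kv_shape),
                                              torch.zeros(kv_shape))

# First step consumes a 5-token prompt, later steps one token at a time.
for new_tokens in (5, 1, 1):
    start = params.sequence_len_offset
    end = start + new_tokens
    key_mem, value_mem = params.key_value_memory_dict[layer_number]
    key_mem[start:end, :2, :] = 1.0    # stands in for the layer writing its keys
    value_mem[start:end, :2, :] = 1.0  # and values for a 2-sample batch
    params.sequence_len_offset = end   # advanced once per forward step

assert params.sequence_len_offset == 7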
class ForwardStep:
"""Forward step function with all the communications.
We use a class here to hide the inference parameters
from the outside caller."""
def __init__(self, model, max_batch_size, max_sequence_len):
"""Set values so we don't need to do it multiple times."""
# Make sure model is in eval mode.
assert not isinstance(model, Iterable), \
'interleaving schedule is not supported for inference'
model.eval()
self.model = model
# Initialize inference parameters.
self.inference_params = InferenceParams(max_batch_size,
max_sequence_len)
# Pipelining arguments.
args = get_args()
self.pipeline_size_larger_than_one = (
args.pipeline_model_parallel_size > 1)
# Threshold of pipelining.
self.pipelining_batch_x_seqlen = \
args.inference_batch_times_seqlen_threshold
def __call__(self, tokens, position_ids, attention_mask):
"""Invocation of the forward methods. Note that self.inference_params
is being modified by the forward step."""
# Pipelining case.
if self.pipeline_size_larger_than_one:
current_batch_x_seqlen = tokens.size(0) * tokens.size(1)
if current_batch_x_seqlen >= self.pipelining_batch_x_seqlen:
micro_batch_size = \
max(1, self.pipelining_batch_x_seqlen // tokens.size(1))
return _with_pipelining_forward_step(self.model,
tokens,
position_ids,
attention_mask,
self.inference_params,
micro_batch_size)
return _no_pipelining_forward_step(self.model,
tokens,
position_ids,
attention_mask,
self.inference_params)
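Editor's note: a hedged sketch of driving ForwardStep by hand (the real call sites are in generation.py below); `model`, `tokens`, `position_ids`, and `attention_mask` are assumed to already exist as CUDA tensors/modules and are not defined here.

forward_step = ForwardStep(model, max_batch_size=tokens.size(0),
                           max_sequence_len=1024)
with torch.no_grad():
    logits = forward_step(tokens, position_ids, attention_mask)
# `logits` is only non-None on the last pipeline stage. The helpers called by
# ForwardStep advance inference_params.sequence_len_offset themselves, so the
# next call should pass only the newly generated token positions.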
def _get_recv_buffer_dtype(args):
"""Receive happens between the layers."""
if args.fp32_residual_connection:
return torch.float
return args.params_dtype
def _allocate_recv_buffer(batch_size, sequence_length):
"""Receive happens between the layers with size [s, b, h]."""
if mpu.is_pipeline_first_stage():
return None
args = get_args()
recv_size = (sequence_length, batch_size, args.hidden_size)
return torch.empty(recv_size,
dtype=_get_recv_buffer_dtype(args),
device=torch.cuda.current_device())
def _forward_step_helper(model, tokens, position_ids, attention_mask,
inference_params, recv_buffer=None):
"""Single forward step. Update the allocate memory flag so
only the first time the memory is allocated."""
batch_size = tokens.size(0)
sequence_length = tokens.size(1)
if recv_buffer is None:
recv_buffer = _allocate_recv_buffer(batch_size, sequence_length)
# Receive from previous stage.
recv_from_prev_pipeline_rank_(recv_buffer)
# Forward pass through the model.
model.set_input_tensor(recv_buffer)
output_tensor = model(tokens, position_ids, attention_mask,
inference_params=inference_params)
# Send output to the next stage.
send_to_next_pipeline_rank(output_tensor)
return output_tensor
def _no_pipelining_forward_step(model, tokens, position_ids, attention_mask,
inference_params, recv_buffer=None):
"""If recv_buffer is none, we will allocate one on the fly."""
# Run a simple forward pass.
output_tensor = _forward_step_helper(model, tokens, position_ids,
attention_mask, inference_params,
recv_buffer=recv_buffer)
# Update the sequence length offset.
inference_params.sequence_len_offset += tokens.size(1)
logits = None
if mpu.is_pipeline_last_stage():
logits = output_tensor
return logits
def _with_pipelining_forward_step(model, tokens, position_ids, attention_mask,
inference_params, micro_batch_size):
"""No interleaving is supported."""
sequence_length = tokens.size(1)
batch_size = tokens.size(0)
# Divide the batch dimension into micro batches.
num_micro_batches, last_chunk = divmod(batch_size,
micro_batch_size)
if last_chunk > 0:
num_micro_batches += 1
# Preallocate memory for output logits.
logits = None
if mpu.is_pipeline_last_stage():
args = get_args()
logits = torch.empty(
(batch_size, sequence_length, args.padded_vocab_size),
dtype=torch.float32, device=torch.cuda.current_device())
# Preallocate recv buffer.
recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length)
for micro_batch_index in range(num_micro_batches):
        # Slice along the batch dimension.
start = micro_batch_index * micro_batch_size
end = min(start + micro_batch_size, batch_size)
this_micro_batch_size = end - start
tokens2use = tokens[start:end, ...]
position_ids2use = position_ids[start:end, ...]
# Run a simple forward pass.
if this_micro_batch_size != micro_batch_size:
recv_buffer = None
output = _forward_step_helper(model, tokens2use, position_ids2use,
attention_mask, inference_params,
recv_buffer=recv_buffer)
# Adjust the batch size offset to account for the micro-batch.
inference_params.batch_size_offset += this_micro_batch_size
# Copy logits.
if mpu.is_pipeline_last_stage():
logits[start:end, ...] = output
# Once we are done with all the micro-batches, we can
# adjust the sequence length offset.
inference_params.sequence_len_offset += sequence_length
# and reset the batch size offset
inference_params.batch_size_offset = 0
return logits
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generation utilities."""
import torch
import torch.nn.functional as F
from megatron import get_args, get_tokenizer, mpu
from megatron.utils import get_ltor_masks_and_position_ids
from .communication import (
copy_from_last_to_first_pipeline_stage,
broadcast_from_last_pipeline_stage,
broadcast_from_last_to_first_pipeline_stage)
from .forward_step import ForwardStep
from .sampling import sample
def score_and_return_on_first_stage(model, tokens, lengths):
"""Function for just scoring.
Arguments:
model: no interleaving is supported.
tokens: prompt tokens extended to be of size [b, max_prompt_length]
lengths: original prompt length, size: [b]
Note: Outside of model, other parameters only need to be available on
rank 0.
Outputs:
output_log_probs: log probability of the selected tokens. size: [b, s]
"""
args = get_args()
batch_size = tokens.size(0)
max_prompt_length = lengths.max().item()
assert max_prompt_length == tokens.size(1)
max_sequence_length = min(max_prompt_length, args.max_position_embeddings)
# forward step.
forward_step = ForwardStep(model, batch_size, max_sequence_length)
# ===================
# Pre-allocate memory
# ===================
# Log probability of the sequence (prompt + generated tokens).
output_log_probs = None
output_log_probs_size = (batch_size, max_sequence_length - 1)
if mpu.is_pipeline_last_stage():
output_log_probs = torch.empty(output_log_probs_size,
dtype=torch.float32,
device=torch.cuda.current_device())
# =============
    # Run inference
# =============
with torch.no_grad():
attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens)
        # logits will be meaningful only in the last pipeline stage.
logits = forward_step(tokens, position_ids, attention_mask)
if mpu.is_pipeline_last_stage():
            # The last stage should always have an output.
assert logits is not None
log_probs = F.log_softmax(logits, dim=2)
# Pick the tokens that we need to get the log
# probabilities for. Note that next input token is
# the token which we selected in the current logits,
# so shift by 1.
indices = torch.unsqueeze(tokens[:, 1:], 2)
output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2)
# ======================================
# Broadcast to the first pipeline stage.
# ======================================
output_log_probs = broadcast_from_last_to_first_pipeline_stage(
output_log_probs_size, torch.float32, output_log_probs)
return tokens, lengths, output_log_probs
def generate_tokens_probs_and_return_on_first_stage(
model, tokens, lengths,
return_output_log_probs=False,
top_k=0, top_p=0.0,
temperature=1.0,
use_eod_token_for_early_termination=True):
"""Main token generation function.
Arguments:
model: no interleaving is supported.
tokens: prompt tokens extended to be of size [b, max-sequence-length]
lengths: original prompt length, size: [b]
return_output_log_probs: flag to calculate the log probability of
the generated tokens. Note that the log probability is the one
from the original logit.
top_k, top_p: top-k and top-p sampling parameters.
            Note that top-k = 1 is greedy. Also, these parameters are
            mutually exclusive, meaning that:
                if top-k > 0 then top-p must be 0,
                if top-p > 0 then top-k must be 0.
temperature: sampling temperature.
use_eod_token_for_early_termination: if True, do early termination if
all the sequences have reached this token.
Note: Outside of model, other parameters only need to be available on
rank 0.
    Outputs: Note that the size is adjusted to a lower value than
max-sequence-length if generation is terminated early.
tokens: prompt and generated tokens. size: [b, :]
generated_sequence_lengths: total length (including prompt) of
the generated sequence. size: [b]
output_log_probs: log probability of the selected tokens. size: [b, s]
"""
args = get_args()
tokenizer = get_tokenizer()
batch_size = tokens.size(0)
min_prompt_length = lengths.min().item()
max_sequence_length = tokens.size(1)
max_sequence_length = min(max_sequence_length, args.max_position_embeddings)
# forward step.
forward_step = ForwardStep(model, batch_size, max_sequence_length)
# Added termination_id to support the case that we want to terminate the
# generation once that id is generated.
if hasattr(args, 'eos_id'):
termination_id = args.eos_id
else:
termination_id = tokenizer.eod
# ===================
# Pre-allocate memory
# ===================
# Log probability of the sequence (prompt + generated tokens).
output_log_probs = None
output_log_probs_size = (batch_size, max_sequence_length - 1)
    # Lengths of generated sequences including prompts.
generated_sequence_lengths = None
if mpu.is_pipeline_last_stage():
if return_output_log_probs:
output_log_probs = torch.empty(output_log_probs_size,
dtype=torch.float32,
device=torch.cuda.current_device())
generated_sequence_lengths = torch.ones(
batch_size, dtype=torch.int64,
device=torch.cuda.current_device()) * max_sequence_length
# Whether we have reached a termination id.
is_generation_done = torch.zeros(batch_size, dtype=torch.uint8,
device=torch.cuda.current_device())
# =============
    # Run inference
# =============
with torch.no_grad():
attention_mask, position_ids = _build_attention_mask_and_position_ids(
tokens)
prev_context_length = 0
for context_length in range(min_prompt_length, max_sequence_length):
# Pick the slice that we need to pass through the network.
tokens2use = tokens[:, prev_context_length:context_length]
positions2use = position_ids[:, prev_context_length:context_length]
attention_mask2use = attention_mask[
..., prev_context_length:context_length, :context_length]
            # logits will be meaningful only in the last pipeline stage.
logits = forward_step(tokens2use, positions2use, attention_mask2use)
if mpu.is_pipeline_last_stage():
                # The last stage should always have an output.
assert logits is not None
# Sample.
last_token_logits = logits[:, -1, :]
new_sample = sample(last_token_logits,
top_k=top_k,
top_p=top_p,
temperature=temperature,
vocab_size=tokenizer.vocab_size)
                # If a prompt length is smaller than or equal to the current
                # context length, it means we have started generating tokens.
started = lengths <= context_length
# Update the tokens.
tokens[started, context_length] = new_sample[started]
# Calculate the log probabilities.
if return_output_log_probs:
log_probs = F.log_softmax(logits, dim=2)
if return_output_log_probs:
# Pick the tokens that we need to get the log
# probabilities for. Note that next input token is
# the token which we selected in the current logits,
# so shift by 1.
indices = torch.unsqueeze(
tokens[
:,
(prev_context_length + 1):(context_length + 1)],
2)
output_log_probs[:,
prev_context_length:context_length] = \
torch.gather(log_probs, 2, indices).squeeze(2)
# Update the tokens on the first stage so the next input to
# the network is correct.
copy_from_last_to_first_pipeline_stage(batch_size, torch.int64,
tokens[:, context_length])
# Update the context length for the next token generation.
prev_context_length = context_length
# Check if all the sequences have hit the termination_id.
done = None
if mpu.is_pipeline_last_stage():
done_token = (new_sample == termination_id).byte() & \
started.byte()
just_finished = (done_token & ~is_generation_done).bool()
generated_sequence_lengths[just_finished.view(-1)] = \
context_length + 1
is_generation_done = is_generation_done | done_token
done = torch.all(is_generation_done)
done = broadcast_from_last_pipeline_stage(1, torch.uint8,
tensor=done)
if use_eod_token_for_early_termination and done:
break
# ===================================================
    # Update the length based on the max generated length.
# ===================================================
tokens = tokens[:, :(context_length + 1)]
if mpu.is_pipeline_last_stage():
if return_output_log_probs:
output_log_probs = output_log_probs[:, :context_length]
# ======================================
# Broadcast to the first pipeline stage.
# ======================================
generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage(
batch_size, torch.int64, generated_sequence_lengths)
if return_output_log_probs:
output_log_probs_size = (batch_size, context_length)
output_log_probs = broadcast_from_last_to_first_pipeline_stage(
output_log_probs_size, torch.float32, output_log_probs)
return tokens, generated_sequence_lengths, output_log_probs
def _build_attention_mask_and_position_ids(tokens):
"""Build the attention mask and postition ids for the input tokens."""
# Since we are not interested in loss-mask and reset attention/position
# is also False, eod_token is not used so it is safe to set it to None.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
data=tokens,
eod_token=None,
reset_position_ids=False,
reset_attention_mask=False,
eod_mask_loss=False)
return attention_mask, position_ids
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sampling utilities.
Part of this code is inspired by:
- https://github.com/ari-holtzman/degen/blob/master/gen.py
- https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html
"""
import torch
def modify_logits_for_top_k_filtering(logits, top_k):
"""Set the logits for none top-k values to -inf."""
filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits.masked_fill_(filter_, float('-Inf'))
def modify_logits_for_top_p_filtering(logits, top_p):
"""Set the logits for none top-p values to -inf."""
# First sort and calculate cumulative sum of probabilities.
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
    # Filtering based on the cumulative sum.
filter_ = cumulative_probs > top_p
# This shift by 1 is weird and I cannot justify it. This existed
# in the original implementation:
# https://github.com/ari-holtzman/degen/blob/master/gen.py
# and I guess it is needed so keeping it for now.
filter_[:, 1:] = filter_[:, :-1].clone()
# Make sure we at least have one token to select from.
filter_[..., 0] = 0
# Fill in the filtered part
filter_ = filter_.scatter(1, sorted_indices, filter_)
logits.masked_fill_(filter_, float('-Inf'))
def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None):
""" Sample and generate a token.
Note: logits has the dimension [b, v] where b is the batch size
and v is the vocabulary size.
    If vocab_size is provided, we will make sure the samples that are
    generated are in [0, vocab_size). This avoids out-of-vocabulary
    generations caused by padding.
"""
# Check logits for consistency.
assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.'
assert logits.type() == 'torch.cuda.FloatTensor', \
'input logits should be floats.'
# Greedy is just simple argmax.
if top_k == 1:
assert top_p == 0.0, 'cannot set both greedy and top-p samplings.'
samples = torch.argmax(logits, dim=-1)
# Top-k or top-p sampling.
else:
        # Clone so we do not modify the inputs.
logits = logits.clone()
# Apply temperature in place.
if temperature != 1.0:
logits.div_(temperature)
if top_k > 1:
assert top_p == 0.0, 'cannot set both top-k and top-p samplings.'
assert top_k <= logits.size(1), 'top-k is larger than logit size.'
if vocab_size:
assert top_k < vocab_size, 'top-k is larger than vocab size.'
modify_logits_for_top_k_filtering(logits, top_k)
elif top_p > 0.0:
assert top_p <= 1.0, 'top-p should be in (0, 1].'
modify_logits_for_top_p_filtering(logits, top_p)
# After filtering, we need to recalculate the distribution.
probs = logits.softmax(dim=-1)
samples = torch.multinomial(probs, num_samples=1).view(-1)
    # If vocab size is provided, make sure the samples are
    # in the range [0, vocab-size).
if vocab_size:
samples = torch.clamp(samples, min=0, max=(vocab_size - 1))
return samples
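Purely as an illustration (not part of the file above), a minimal sketch of how sample() might be called for greedy, top-k, and top-p decoding. The shapes and hyperparameters are made up, and a CUDA device is assumed since sample() checks for torch.cuda.FloatTensor inputs.

# Illustrative usage sketch for sample() as defined above; values are made up.
import torch

batch_size, padded_vocab_size = 4, 50304
logits = torch.randn(batch_size, padded_vocab_size, device='cuda')

greedy_ids = sample(logits, top_k=1)                      # plain argmax
top_k_ids = sample(logits, top_k=40, temperature=0.9)     # top-k sampling
top_p_ids = sample(logits, top_p=0.9, vocab_size=50257)   # nucleus sampling,
                                                          # clamped to the
                                                          # unpadded vocab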
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization utilities."""
import torch
from megatron import get_tokenizer
from .communication import broadcast_int_list, broadcast_tensor
def detokenize_generations(tokens_gpu_tensor,
lengths_gpu_tensor,
return_segments):
"""Detokenize the generated tokens."""
tokenizer = get_tokenizer()
prompts_plus_generations = []
if return_segments:
prompts_plus_generations_segments = []
tokens = tokens_gpu_tensor.cpu().numpy().tolist()
lengths = lengths_gpu_tensor.cpu().numpy().tolist()
for sequence_tokens, length in zip(tokens, lengths):
sequence_tokens = sequence_tokens[:length]
prompts_plus_generations.append(
tokenizer.detokenize(sequence_tokens))
if return_segments:
words = []
for token in sequence_tokens:
word = tokenizer.tokenizer.decoder[token]
word = bytearray(
[tokenizer.tokenizer.byte_decoder[c] for c in word]).decode(
'utf-8', errors='replace')
words.append(word)
prompts_plus_generations_segments.append(words)
if return_segments:
return tokens, prompts_plus_generations, \
prompts_plus_generations_segments
return tokens, prompts_plus_generations
def tokenize_prompts(prompts=None, tokens_to_generate=None,
add_BOS=None, rank=0):
"""Tokenize prompts and make them avaiable on all ranks."""
# On all ranks set to None so we can pass them to functions
sizes_list = None
prompts_tokens_cuda_long_tensor = None
prompts_length_cuda_long_tensor = None
# On the specified rank, build the above.
if torch.distributed.get_rank() == rank:
assert prompts is not None
assert tokens_to_generate is not None
# Tensor of tokens padded and their unpadded length.
prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
_tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS)
        # We need the sizes of these tensors for the broadcast.
        sizes_list = [prompts_tokens_cuda_long_tensor.size(0), # Batch size
                      prompts_tokens_cuda_long_tensor.size(1)] # Sequence length
# First, broadcast the sizes.
sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank)
    # Now that we have the sizes, we can broadcast the tokens
# and length tensors.
sizes = sizes_tensor.tolist()
prompts_tokens_cuda_long_tensor = broadcast_tensor(
sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank)
prompts_length_cuda_long_tensor = broadcast_tensor(
sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor,
rank=rank)
return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor
def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS):
"""Given a set of prompts and number of tokens to generate:
- tokenize prompts
- set the sequence length to be the max of length of prompts
plus the number of tokens we would like to generate
- pad all the sequences to this length so we can convert them
into a 2D tensor.
"""
# Tokenize all the prompts.
tokenizer = get_tokenizer()
if add_BOS:
prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt)
for prompt in prompts]
else:
prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]
    # Now we have a list of lists of tokens, where each list has a different
    # length. We want to extend each list to:
# - incorporate the tokens that need to be generated
# - make all the sequences equal length.
# Get the prompts length.
prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens]
# Get the max prompts length.
max_prompt_len = max(prompts_length)
    # Number of tokens in each sample of the batch.
samples_length = max_prompt_len + tokens_to_generate
# Now update the list of list to be of the same size: samples_length.
for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length):
padding_size = samples_length - prompt_length
prompt_tokens.extend([tokenizer.eod] * padding_size)
    # Now that we are in a structured format, we can convert to tensors.
prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens)
prompts_length_tensor = torch.cuda.LongTensor(prompts_length)
return prompts_tokens_tensor, prompts_length_tensor
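To make the padding arithmetic in _tokenize_prompts_and_batch concrete, here is a small self-contained illustration with toy token ids (the real code uses the tokenizer's eod token as padding).

# Toy illustration of the padding performed by _tokenize_prompts_and_batch:
# every prompt is extended with the EOD token until it reaches
# max(prompt lengths) + tokens_to_generate.
eod = 0
prompts_tokens = [[11, 22, 33], [44, 55]]        # pretend tokenized prompts
tokens_to_generate = 3

prompts_length = [len(p) for p in prompts_tokens]           # [3, 2]
samples_length = max(prompts_length) + tokens_to_generate   # 3 + 3 = 6
for p, length in zip(prompts_tokens, prompts_length):
    p.extend([eod] * (samples_length - length))

# prompts_tokens is now [[11, 22, 33, 0, 0, 0], [44, 55, 0, 0, 0, 0]]
# and prompts_length keeps the unpadded lengths [3, 2].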
...@@ -19,8 +19,8 @@ import threading ...@@ -19,8 +19,8 @@ import threading
from flask import Flask, request, jsonify, current_app from flask import Flask, request, jsonify, current_app
from flask_restful import Resource, Api from flask_restful import Resource, Api
from megatron import get_args from megatron import get_args
from megatron import mpu from megatron.text_generation import generate_and_post_process
from megatron.text_generation_utils import generate
GENERATE_NUM = 0 GENERATE_NUM = 0
lock = threading.Lock() lock = threading.Lock()
...@@ -39,26 +39,38 @@ class MegatronGenerate(Resource): ...@@ -39,26 +39,38 @@ class MegatronGenerate(Resource):
print("request IP: " + str(request.remote_addr)) print("request IP: " + str(request.remote_addr))
print(json.dumps(request.get_json()),flush=True) print(json.dumps(request.get_json()),flush=True)
print("current time: ", datetime.datetime.now()) print("current time: ", datetime.datetime.now())
if not "prompts" in request.get_json():
return "prompts argument required", 400
if "max_len" in request.get_json():
return "max_len is no longer used. Replace with tokens_to_generate", 400
sentences = request.get_json()["sentences"] if "sentences" in request.get_json():
if len(sentences) > 128: return "sentences is no longer used. Replace with prompts", 400
return "Maximum number of sentences is 128", 400
prompts = request.get_json()["prompts"]
if len(prompts) > 128:
return "Maximum number of prompts is 128", 400
tokens_to_generate = 64 # Choosing hopefully sane default. Full sequence is slow tokens_to_generate = 64 # Choosing hopefully sane default. Full sequence is slow
if "tokens_to_generate" in request.get_json(): if "tokens_to_generate" in request.get_json():
tokens_to_generate = request.get_json()["tokens_to_generate"] tokens_to_generate = request.get_json()["tokens_to_generate"]
if not isinstance(tokens_to_generate, int): if not isinstance(tokens_to_generate, int):
return "tokens_to_generate must be an integer greater than 0" return "tokens_to_generate must be an integer greater than 0"
if tokens_to_generate < 1: if tokens_to_generate < 0:
return "tokens_to_generate must be an integer greater than 0" return "tokens_to_generate must be an integer greater than or equal to 0"
all_probs = False logprobs = False
if "all_probs" in request.get_json(): if "logprobs" in request.get_json():
all_probs = request.get_json()["all_probs"] logprobs = request.get_json()["logprobs"]
if not isinstance(all_probs, bool): if not isinstance(logprobs, bool):
return "all_probs must be a boolean value" return "logprobs must be a boolean value"
temperature = args.temperature if tokens_to_generate == 0 and not logprobs:
return "tokens_to_generate=0 implies logprobs should be True"
temperature = 1.0
if "temperature" in request.get_json(): if "temperature" in request.get_json():
temperature = request.get_json()["temperature"] temperature = request.get_json()["temperature"]
if not (type(temperature) == int or type(temperature) == float): if not (type(temperature) == int or type(temperature) == float):
...@@ -66,6 +78,24 @@ class MegatronGenerate(Resource): ...@@ -66,6 +78,24 @@ class MegatronGenerate(Resource):
if not (0.0 < temperature <= 100.0): if not (0.0 < temperature <= 100.0):
return "temperature must be a positive number less than or equal to 100.0" return "temperature must be a positive number less than or equal to 100.0"
top_k = 0.0
if "top_k" in request.get_json():
top_k = request.get_json()["top_k"]
if not (type(top_k) == int):
return "top_k must be an integer equal to or greater than 0 and less than or equal to 1000"
if not (0 <= top_k <= 1000):
return "top_k must be equal to or greater than 0 and less than or equal to 1000"
top_p = 0.0
if "top_p" in request.get_json():
top_p = request.get_json()["top_p"]
if not (type(top_p) == float):
return "top_p must be a positive float less than or equal to 1.0"
if top_p > 0.0 and top_k > 0.0:
return "cannot set both top-k and top-p samplings."
if not (0 <= top_p <= 1.0):
return "top_p must be less than or equal to 1.0"
add_BOS = False add_BOS = False
if "add_BOS" in request.get_json(): if "add_BOS" in request.get_json():
add_BOS = request.get_json()["add_BOS"] add_BOS = request.get_json()["add_BOS"]
...@@ -74,24 +104,27 @@ class MegatronGenerate(Resource): ...@@ -74,24 +104,27 @@ class MegatronGenerate(Resource):
with lock: # Need to get lock to keep multiple threads from hitting code with lock: # Need to get lock to keep multiple threads from hitting code
MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate
resp_sentences, resp_sentences_seg, output_logits, full_logits, tokens = generate(self.model, sentences, tokens_to_generate, all_probs, temperature, add_BOS) response, response_seg, response_logprobs, _ = \
generate_and_post_process(
if all_probs: self.model,
return jsonify({"sentences": resp_sentences, prompts=prompts,
"segments": resp_sentences_seg, tokens_to_generate=tokens_to_generate,
"logits": output_logits, return_output_log_probs=logprobs,
"all_logits": full_logits, top_k_sampling=top_k,
"tokens": tokens}) top_p_sampling=top_p,
temperature=temperature,
add_BOS=add_BOS,
use_eod_token_for_early_termination=True)
return jsonify({"sentences": resp_sentences, return jsonify({"text": response,
"segments": resp_sentences_seg, "segments": response_seg,
"logits": output_logits}) "logprobs": response_logprobs})
class MegatronServer(object): class MegatronServer(object):
def __init__(self, model): def __init__(self, model):
self.app = Flask(__name__, static_url_path='') self.app = Flask(__name__, static_url_path='')
api = Api(self.app) api = Api(self.app)
api.add_resource(MegatronGenerate, '/generate', resource_class_args=[model]) api.add_resource(MegatronGenerate, '/api', resource_class_args=[model])
def run(self, url): def run(self, url):
self.app.run(url, threaded=True, debug=False) self.app.run(url, threaded=True, debug=False)
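For completeness, a hedged sketch of a client call against the renamed /api endpoint. The host, port, and HTTP verb are assumptions (the handler method name is outside this diff); the request keys and response keys follow the validation and jsonify code shown above.

# Illustrative client for the /api endpoint. The URL and the PUT verb are
# assumptions; adjust them to match how MegatronServer.run() is launched.
import requests

payload = {
    "prompts": ["Megatron-LM is"],
    "tokens_to_generate": 32,
    "logprobs": True,
    "temperature": 0.9,
    "top_k": 40,
}
resp = requests.put("http://localhost:5000/api", json=payload)
print(resp.json()["text"])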
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for generating text."""
import copy
import json
import os
import time
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron import get_tokenizer
from megatron import mpu
from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
from megatron.p2p_communication import recv_forward, send_forward
# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible?
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import Float16Module
def get_batch(context_tokens):
"""Generate batch from context tokens."""
args = get_args()
tokenizer = get_tokenizer()
# Move to GPU.
tokens = context_tokens.contiguous().cuda()
    # Get the attention mask and position ids.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return tokens, attention_mask, position_ids
def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
""" This function has been mostly taken from huggingface conversational
ai code at
https://medium.com/huggingface/how-to-build-a-state-of-the-art-
conversational-ai-with-transfer-learning-2d818ac26313 """
if top_k > 0:
# Remove all tokens with a probability less than the
# last token of the top-k
indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits[indices_to_remove] = filter_value
if top_p > 0.0:
        # Sort the logits and compute the cumulative probabilities.
sorted_logits, sorted_indices = torch.sort(
logits, descending=True, dim=-1)
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1),
dim=-1)
# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove = cumulative_probs > top_p
# Shift the indices to the right to keep also the first token
# above the threshold
sorted_indices_to_remove[..., 1:] \
= sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
for i in range(sorted_indices.size(0)):
indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]]
logits[i][indices_to_remove] = filter_value
return logits
def pad_batch(batch, pad_id, max_len):
context_lengths = []
max_context_length = max([len(tokens) for tokens in batch])
for tokens in batch:
context_length = len(tokens)
if context_length < max_context_length + max_len:
tokens.extend([pad_id] * (max_context_length + max_len - context_length))
context_lengths.append(context_length)
return batch, context_lengths
def tokenize_batch(sentences, max_len, add_BOS):
args = get_args()
tokenizer = get_tokenizer()
if add_BOS:
context_tokens = [[tokenizer.eod] + tokenizer.tokenize(s) for s in sentences]
else:
context_tokens = [tokenizer.tokenize(s) for s in sentences]
context_tokens, context_lengths = pad_batch(context_tokens,
tokenizer.eod, max_len)
context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
context_length_tensor = torch.cuda.LongTensor(context_lengths)
return context_tokens_tensor, context_length_tensor
def send_generate_info(context_tokens_tensor, context_length_tensor, tokens_to_generate, all_probs, temperature):
"""
Needs to be synced up with receive_generate_info
"""
# Send the sizes of the tensors
input_info = [context_tokens_tensor.size(0), context_tokens_tensor.size(1), tokens_to_generate, all_probs, temperature]
input_info_tensor = torch.cuda.FloatTensor(input_info)
torch.distributed.broadcast(input_info_tensor, 0)
# Send variables to all ranks
torch.distributed.broadcast(context_length_tensor, 0)
torch.distributed.broadcast(context_tokens_tensor, 0)
def receive_generate_info():
"""
Needs to be synced up with send_generate_info
"""
input_info_tensor = torch.empty(5, dtype=torch.float32, device=torch.cuda.current_device())
torch.distributed.broadcast(input_info_tensor, 0)
batch_size = int(input_info_tensor[0].item())
seq_len = int(input_info_tensor[1].item())
tokens_to_generate = int(input_info_tensor[2].item())
all_probs = int(input_info_tensor[3].item())
temperature = float(input_info_tensor[4].item())
context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.cuda.current_device())
context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.cuda.current_device())
# Send variables to all ranks
torch.distributed.broadcast(context_length_tensor, 0)
torch.distributed.broadcast(context_tokens_tensor, 0)
return context_length_tensor, context_tokens_tensor, tokens_to_generate, all_probs, temperature
def synced_generate(model, context_tokens_tensor, context_length_tensor, tokens_to_generate, all_probs, temperature):
context_length = context_length_tensor.min().item()
tokens, attention_mask, position_ids = get_batch(context_tokens_tensor)
batch_token_iterator = sample_sequence_batch(model, context_tokens_tensor,
context_length_tensor,
attention_mask, position_ids,
tokens_to_generate,
all_probs,
temperature=temperature)
for tokens, lengths, output_logits, full_logits in batch_token_iterator:
context_length += 1
if mpu.is_pipeline_last_stage():
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
torch.distributed.broadcast(output_logits, src, group)
if all_probs:
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
torch.distributed.broadcast(full_logits, src, group)
else:
if mpu.is_pipeline_first_stage():
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
output_logits = torch.empty(tokens.size(0), context_length-1, dtype=torch.float32, device=torch.device("cuda"))
torch.distributed.broadcast(output_logits, src, group)
if all_probs:
args = get_args()
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
full_logits = torch.empty(tokens.size(0), context_length, args.padded_vocab_size, dtype=torch.float32, device=torch.device("cuda"))
torch.distributed.broadcast(full_logits, src, group)
if tokens is not None:
return tokens[:, :context_length], output_logits, full_logits
def generate(model, sentences=None, tokens_to_generate=0, all_probs=False, temperature=1.0, add_BOS=False):
model.eval()
if torch.distributed.get_rank() == 0:
context_tokens_tensor, context_length_tensor = tokenize_batch(sentences, tokens_to_generate, add_BOS)
send_generate_info(context_tokens_tensor, context_length_tensor, tokens_to_generate, all_probs, temperature)
else:
context_length_tensor, context_tokens_tensor, tokens_to_generate, all_probs, temperature = receive_generate_info()
output = synced_generate(model, context_tokens_tensor, context_length_tensor, tokens_to_generate, all_probs, temperature)
if output is not None:
decode_tokens, output_logits, full_logits = output
args = get_args()
tokenizer = get_tokenizer()
resp_sentences = []
resp_sentences_seg = []
decode_tokens = decode_tokens.cpu().numpy().tolist()
for decode_token in decode_tokens:
resp_sentences.append(tokenizer.detokenize(decode_token))
words = []
for token in decode_token:
word = tokenizer.tokenizer.decoder[token]
word = bytearray([tokenizer.tokenizer.byte_decoder[c] for c in word]).decode('utf-8', errors='replace')
words.append(word)
resp_sentences_seg.append(words)
output_logits = output_logits.cpu().numpy().tolist()
if all_probs:
full_logits = full_logits.cpu().numpy().tolist()
return resp_sentences, resp_sentences_seg, output_logits, full_logits, decode_tokens
def generate_samples_eval(model, context, max_gen_length, eos_token_id):
"""
    This function is here to provide a matching API for a legacy task.
    This implementation has not yet been tested to make sure it matches
    the legacy behavior.
"""
#assert False, "Implementation untested"
args = get_args()
args.eos_id = eos_token_id
raw_text_len = len(context)
resp_sentences = generate(model, [context], max_gen_length)
if resp_sentences:
return resp_sentences[0][raw_text_len:]
def switch(val1, val2, boolean):
boolean = boolean.type_as(val1)
return (1 - boolean) * val1 + boolean * val2
def forward_step(model, tokens, position_ids, attention_mask, tokentype_ids,
set_inference_key_value_memory=False,
inference_max_sequence_len=None):
# Hidden size changes when not using recompute, need to tell p2p_communicate
# functions the correct size
args = get_args()
orig_seq_length = args.seq_length
args.seq_length = tokens.shape[1]
args.micro_batch_size = tokens.shape[0]
input_tensor = recv_forward()
# Forward pass through the model.
unwrapped_model = unwrap_model(
model, (torchDDP, LocalDDP, Float16Module))
unwrapped_model.set_input_tensor(input_tensor)
output_tensor = model(
tokens, position_ids, attention_mask,
tokentype_ids=tokentype_ids,
set_inference_key_value_memory=set_inference_key_value_memory,
inference_max_sequence_len=inference_max_sequence_len)
send_forward(output_tensor)
args.seq_length = orig_seq_length
return output_tensor
def sample_sequence_batch(model, context_tokens, context_lengths,
attention_mask, position_ids,
tokens_to_generate, all_probs=False, type_ids=None, temperature=None):
args = get_args()
tokenizer = get_tokenizer()
model.eval()
with torch.no_grad():
context_length = context_lengths.min().item()
        # eos_id was added to support generate_samples_eval, which passes
        # eos_id as an argument and needs termination when that id is found.
if hasattr(args, 'eos_id'):
eos_id = args.eos_id
else:
eos_id = tokenizer.eod
counter = 0
batch_size = context_tokens.size(0)
is_done = torch.zeros([batch_size]).byte().cuda()
tokens = context_tokens
output_logits = None
# Generate enough tokens for the longest sequence
maxlen = tokens_to_generate + context_lengths.max().item()
if maxlen > args.seq_length:
maxlen = args.seq_length
lengths = torch.ones([batch_size]).long().cuda() * maxlen
while context_length < maxlen:
types2use = None
if counter == 0:
# Allocate memory for the entire context.
set_inference_key_value_memory = True
tokens2use = tokens[:, :context_length]
positions2use = position_ids[:, :context_length]
if type_ids is not None:
types2use = type_ids[:, :context_length]
else:
# Set this to false so the memory is not reallocated.
set_inference_key_value_memory = False
tokens2use = tokens[:, context_length - 1].view(
batch_size, -1)
positions2use = position_ids[:, context_length - 1].view(
batch_size, -1)
if type_ids is not None:
types2use = type_ids[:, context_length - 1].view(
batch_size, -1)
output = forward_step(
model, tokens2use,
positions2use,
attention_mask,
set_inference_key_value_memory=set_inference_key_value_memory,
inference_max_sequence_len=maxlen,
tokentype_ids=types2use)
if mpu.is_pipeline_last_stage():
assert output is not None
output = output.float()
logits = output[:, -1].view(batch_size, -1).contiguous()
if args.greedy:
prev = torch.argmax(logits, dim=-1).view(-1)
else:
logits = logits.float()
logits /= temperature
logits = top_k_logits(logits, top_k=args.top_k,
top_p=args.top_p)
log_probs = F.softmax(logits, dim=-1)
prev = torch.multinomial(log_probs, num_samples=1).view(-1)
started = context_lengths <= context_length
# Clamp the out of vocabulary tokens.
tokenizer = get_tokenizer()
prev = torch.clamp(prev, max=tokenizer.vocab_size - 1)
new_tokens = switch(
tokens[:, context_length].view(-1), prev, started)
tokens[:, context_length] = new_tokens
if output_logits is None:
output_context = F.log_softmax(output[:, :context_length, :], 2)
indices = torch.unsqueeze(tokens[:, 1:context_length+1],2)
output_logits = torch.gather(output_context, 2, indices).squeeze(2)
if all_probs:
full_logits = output_context
else:
output_context = F.log_softmax(output, 2)
indices = torch.unsqueeze(new_tokens,1).unsqueeze(2)
new_output_logits = torch.gather(output_context, 2, indices).squeeze(2)
# TODO(rprenger) we're copying output_logits every time. Should pre-allocate
output_logits = torch.cat([output_logits, new_output_logits],1)
if all_probs:
full_logits = torch.cat([full_logits, output_context], 1)
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
torch.distributed.broadcast(new_tokens, src, group)
done_token = (prev == eos_id).byte() & started.byte()
just_finished = (done_token & ~is_done).bool()
lengths[just_finished.view(-1)] = context_length
is_done = is_done | done_token
done = torch.all(is_done)
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_pipeline_model_parallel_group()
torch.distributed.broadcast(done, src, group)
if all_probs:
yield tokens, lengths, output_logits, full_logits
else:
yield tokens, lengths, output_logits, None
else:
if mpu.is_pipeline_first_stage():
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_embedding_group()
new_tokens = torch.empty_like(tokens[:, context_length])
torch.distributed.broadcast(new_tokens, src, group)
tokens[:, context_length] = new_tokens
yield tokens, None, None, None
else:
yield None, None, None, None
done = torch.cuda.ByteTensor([0])
src = mpu.get_pipeline_model_parallel_last_rank()
group = mpu.get_pipeline_model_parallel_group()
torch.distributed.broadcast(done, src, group)
context_length += 1
counter += 1
if done:
break
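The sampling loop above has two phases: the first iteration feeds the whole context (and allocates the inference key-value memory), while every later iteration feeds only the most recently generated token. The runnable toy below sketches that pattern with a stub model; all names are illustrative and unrelated to Megatron.

# Illustrative two-phase decoding pattern with a stub model that returns
# random logits, so the sketch runs end to end.
import torch

def toy_step(token_ids, first_call, vocab_size=100):
    # Stand-in for forward_step(); a real model would (re)use cached K/V here.
    return torch.randn(vocab_size)

def toy_incremental_decode(prompt_ids, max_len):
    tokens = list(prompt_ids)
    logits = toy_step(tokens, first_call=True)             # whole prompt at once
    while len(tokens) < max_len:
        tokens.append(int(logits.argmax()))                # greedy, for brevity
        logits = toy_step(tokens[-1:], first_call=False)   # only the newest token
    return tokens

print(toy_incremental_decode([1, 2, 3], max_len=8))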
...@@ -26,6 +26,7 @@ import torch ...@@ -26,6 +26,7 @@ import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron import get_args from megatron import get_args
from megatron import get_signal_handler
from megatron import get_timers from megatron import get_timers
from megatron import get_tensorboard_writer from megatron import get_tensorboard_writer
from megatron import get_current_global_batch_size from megatron import get_current_global_batch_size
...@@ -426,6 +427,20 @@ def train_step(forward_step_func, data_iterator, ...@@ -426,6 +427,20 @@ def train_step(forward_step_func, data_iterator,
else: else:
grad = word_embeddings_weight.grad grad = word_embeddings_weight.grad
torch.distributed.all_reduce(grad, group=mpu.get_embedding_group()) torch.distributed.all_reduce(grad, group=mpu.get_embedding_group())
# All-reduce position_embeddings grad across first (encoder) and split (decoder)
# stages to ensure that position embeddings parameters stay in sync.
# This should only run for T5 models with pipeline parallelism
if mpu.is_rank_in_position_embedding_group() and \
mpu.get_pipeline_model_parallel_world_size() > 1 and \
args.pipeline_model_parallel_split_rank is not None:
unwrapped_model = model[0]
unwrapped_model = unwrap_model(
unwrapped_model, (torchDDP, LocalDDP, Float16Module))
assert args.DDP_impl == 'local', \
'T5 model is only supported with local DDP mode'
grad = unwrapped_model.language_model.embedding.position_embeddings.weight.main_grad
torch.distributed.all_reduce(grad, group=mpu.get_position_embedding_group())
timers('backward-embedding-all-reduce').stop() timers('backward-embedding-all-reduce').stop()
# Update parameters. # Update parameters.
...@@ -544,6 +559,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, ...@@ -544,6 +559,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
writer.add_scalar('loss-scale', loss_scale, iteration) writer.add_scalar('loss-scale', loss_scale, iteration)
writer.add_scalar('loss-scale vs samples', loss_scale, writer.add_scalar('loss-scale vs samples', loss_scale,
args.consumed_train_samples) args.consumed_train_samples)
if args.log_world_size_to_tensorboard:
writer.add_scalar('world-size', args.world_size, iteration)
writer.add_scalar('world-size vs samples', args.world_size,
args.consumed_train_samples)
if grad_norm is not None: if grad_norm is not None:
writer.add_scalar('grad-norm', grad_norm, iteration) writer.add_scalar('grad-norm', grad_norm, iteration)
writer.add_scalar('grad-norm vs samples', grad_norm, writer.add_scalar('grad-norm vs samples', grad_norm,
...@@ -698,6 +717,14 @@ def train(forward_step_func, model, optimizer, lr_scheduler, ...@@ -698,6 +717,14 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
# Checkpointing # Checkpointing
saved_checkpoint = False saved_checkpoint = False
if args.exit_signal_handler:
signal_handler = get_signal_handler()
if any(signal_handler.signals_received()):
save_checkpoint_and_time(iteration, model, optimizer,
lr_scheduler)
print_datetime('exiting program after receiving SIGTERM.')
sys.exit()
if args.save and args.save_interval and \ if args.save and args.save_interval and \
iteration % args.save_interval == 0: iteration % args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer, save_checkpoint_and_time(iteration, model, optimizer,
......
...@@ -25,6 +25,7 @@ from megatron import get_timers ...@@ -25,6 +25,7 @@ from megatron import get_timers
from megatron import mpu from megatron import mpu
from megatron.checkpointing import load_checkpoint from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint from megatron.checkpointing import save_checkpoint
from megatron.model import ModelType
from megatron.training import evaluate_and_print_results from megatron.training import evaluate_and_print_results
from megatron.training import setup_model_and_optimizer from megatron.training import setup_model_and_optimizer
from megatron.training import train_step from megatron.training import train_step
...@@ -248,6 +249,7 @@ def _train(model, optimizer, lr_scheduler, forward_step, ...@@ -248,6 +249,7 @@ def _train(model, optimizer, lr_scheduler, forward_step,
def finetune(train_valid_datasets_provider, model_provider, def finetune(train_valid_datasets_provider, model_provider,
model_type=ModelType.encoder_or_decoder,
forward_step=_cross_entropy_forward_step, forward_step=_cross_entropy_forward_step,
end_of_epoch_callback_provider=None, end_of_epoch_callback_provider=None,
task_collate_fn=None): task_collate_fn=None):
...@@ -277,7 +279,7 @@ def finetune(train_valid_datasets_provider, model_provider, ...@@ -277,7 +279,7 @@ def finetune(train_valid_datasets_provider, model_provider,
# Build model, optimizer and learning rate scheduler. # Build model, optimizer and learning rate scheduler.
timers('model and optimizer').start() timers('model and optimizer').start()
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider) model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider, model_type)
timers('model and optimizer').stop() timers('model and optimizer').stop()
# If pretrained checkpoint is provided and we have not trained for # If pretrained checkpoint is provided and we have not trained for
......
# Multi-Stage Prompting for Knowledgeable Dialogue Generation
Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework.
## Multi-Stage Dialogue Prompting
### Data Preparation
1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/)
2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets.
### Stage-1: Prompting for Knowledge Generation
1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation.
2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation.
### Stage-2: Prompting for Response Generation
1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file).
2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation.
3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation.
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model evaluation"""
from megatron import get_args
from megatron import print_rank_0
from tasks.msdp.metrics import F1Metric
from tqdm import tqdm
def evaluate_f1(guess_file, answer_file):
"""Evaluating F1 Score"""
guess_list = []
print_rank_0('reading %s' % guess_file)
with open(guess_file, "r") as f:
for i, line in enumerate(tqdm(f)):
line = line.strip()
if "<|endoftext|>" in line:
line = line.replace("<|endoftext|>", "")
guess_list.append(line)
answer_list = []
print_rank_0('reading %s' % answer_file)
with open(answer_file, "r") as f:
for i, line in enumerate(tqdm(f)):
line = line.strip()
if line == "no_passages_used":
line = ""
answer_list.append(line)
assert len(guess_list) == len(answer_list), \
"lengths of guess and answer are different!"
precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list)
print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1))
print_rank_0('done :-)')
def main():
args = get_args()
evaluate_f1(args.guess_file, args.answer_file)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run multi-stage dialogue prompting (MSDP)."""
import os
import sys
sys.path.append(os.path.abspath(os.path.join(
os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir)))
from megatron import get_args
from megatron.initialize import initialize_megatron
def get_tasks_args(parser):
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title='tasks')
# parameters for the knowledgeable dialogue generation
group.add_argument('--task', type=str, required=True,
help='Task name.')
group.add_argument("--sample-input-file", type=str, default=None,
help='Get input from file instead of interactive mode, '
'each line is an input.')
group.add_argument("--sample-output-file", type=str, default=None,
                       help='Output file generated from --sample-input-file')
group.add_argument('--prompt-file', type=str, default=None,
help='prompting file')
group.add_argument('--prompt-type', type=str, default=None,
choices=['knowledge', 'response'],
help='prompt type (knowledge or response)')
group.add_argument('--num-prompt-examples', type=int, default=10,
help='number of prompt examples')
group.add_argument('--guess-file', type=str, default=None,
help='datapath for generated sentences')
group.add_argument('--answer-file', type=str, default=None,
help='datapath for golden sentences')
group.add_argument('--out-seq-length', type=int, default=100,
help='output sequence length')
group.add_argument('--api-prompt', default=False, action="store_true",
help='setup model api for prompting')
group.add_argument('--megatron-api-url', type=str, default=None,
help='url of the megatron api')
return parser
if __name__ == '__main__':
initialize_megatron(extra_args_provider=get_tasks_args)
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
print("Interleaved pipeline schedule is not yet supported for downstream tasks.")
exit()
if args.task == 'MSDP-PROMPT':
from tasks.msdp.prompt import main
elif args.task == 'MSDP-EVAL-F1':
from tasks.msdp.evaluate import main
else:
raise NotImplementedError('Task {} is not implemented.'.format(
args.task))
main()