Commit bf95e032 authored by hepj987

dtk23.04 initialization
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer based language model."""
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron import mpu
from .module import MegatronModule
from megatron.enums import LayerType, AttnMaskType, PositionEmbeddingType
from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal, scaled_init_method_normal
def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
bias=None):
"""LM logits using word embedding weights."""
# Parallel logits.
input_parallel = mpu.copy_to_tensor_model_parallel_region(input_)
# Matrix multiply.
if bias is None:
logits_parallel = F.linear(input_parallel, word_embeddings_weight)
else:
logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias)
# Gather if needed.
if parallel_output:
return logits_parallel
return mpu.gather_from_tensor_model_parallel_region(logits_parallel)
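# Illustrative sketch (not part of the original module): with a tensor-model-
# parallel world size of 1, parallel_lm_logits above reduces to a plain
# F.linear against the (tied) word-embedding weight. The toy sizes below are
# assumptions chosen only for the example.
def _sketch_tied_lm_logits():
    vocab_size, hidden_size, seq_len = 32, 8, 4
    embedding = torch.nn.Embedding(vocab_size, hidden_size)
    hidden_states = torch.randn(seq_len, 1, hidden_size)  # [s, b, h]
    # Same math as parallel_lm_logits when no model parallelism is involved.
    logits = F.linear(hidden_states, embedding.weight)  # [s, b, vocab]
    return logits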
def get_language_model(num_tokentypes, add_pooler,
encoder_attn_mask_type, init_method=None,
scaled_init_method=None, add_decoder=False,
decoder_attn_mask_type=AttnMaskType.causal,
pre_process=True, post_process=True):
"""Build language model and return along with the key to save."""
args = get_args()
if init_method is None:
init_method = init_method_normal(args.init_method_std)
if scaled_init_method is None:
scaled_init_method = scaled_init_method_normal(args.init_method_std,
args.num_layers)
# Language model.
language_model = TransformerLanguageModel(
init_method,
scaled_init_method,
encoder_attn_mask_type,
num_tokentypes=num_tokentypes,
add_decoder=add_decoder,
decoder_attn_mask_type=decoder_attn_mask_type,
add_pooler=add_pooler,
pre_process=pre_process,
post_process=post_process
)
# key used for checkpoints.
language_model_key = 'language_model'
return language_model, language_model_key
class Pooler(MegatronModule):
"""Pooler layer.
Pool hidden states of a specific token (for example start of the
sequence) and add a linear transformation followed by a tanh.
Arguments:
hidden_size: hidden size
init_method: weight initialization method for the linear layer.
bias is set to zero.
"""
def __init__(self, hidden_size, init_method):
super(Pooler, self).__init__()
self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
def forward(self, hidden_states, sequence_index=0):
# hidden_states: [b, s, h]
# sequence_index: index of the token to pool.
pooled = hidden_states[:, sequence_index, :]
pooled = self.dense(pooled)
pooled = torch.tanh(pooled)
return pooled
class Embedding(MegatronModule):
"""Language model embeddings.
Arguments:
hidden_size: hidden size
vocab_size: vocabulary size
embedding_dropout_prob: dropout probability for embeddings
init_method: weight initialization method
num_tokentypes: size of the token-type embeddings. 0 value
will ignore this embedding
"""
def __init__(self,
hidden_size,
vocab_size,
embedding_dropout_prob,
init_method,
num_tokentypes=0):
super(Embedding, self).__init__()
self.hidden_size = hidden_size
self.init_method = init_method
self.num_tokentypes = num_tokentypes
args = get_args()
# Word embeddings (parallel).
self.word_embeddings = mpu.VocabParallelEmbedding(
vocab_size, self.hidden_size,
init_method=self.init_method)
self._word_embeddings_key = 'word_embeddings'
# Position embedding (serial).
self.position_embedding_type = args.position_embedding_type
if self.position_embedding_type == PositionEmbeddingType.absolute:
max_position_embeddings = args.max_position_embeddings
assert max_position_embeddings is not None
self.position_embeddings = torch.nn.Embedding(
max_position_embeddings, self.hidden_size)
self._position_embeddings_key = 'position_embeddings'
# Initialize the position embeddings.
self.init_method(self.position_embeddings.weight)
else:
self.position_embeddings = None
# Token type embedding.
# Add this as an optional field that can be added through
# method call so we can load a pretrain model without
# token types and add them as needed.
self._tokentype_embeddings_key = 'tokentype_embeddings'
if self.num_tokentypes > 0:
self.tokentype_embeddings = torch.nn.Embedding(
self.num_tokentypes, self.hidden_size)
# Initialize the token-type embeddings.
self.init_method(self.tokentype_embeddings.weight)
else:
self.tokentype_embeddings = None
# Embeddings dropout
self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
def add_tokentype_embeddings(self, num_tokentypes):
"""Add token-type embedding. This function is provided so we can add
token-type embeddings in case the pretrained model does not have it.
This allows us to load the model normally and then add this embedding.
"""
if self.tokentype_embeddings is not None:
raise Exception('tokentype embeddings is already initialized')
if torch.distributed.get_rank() == 0:
print('adding embedding for {} tokentypes'.format(num_tokentypes),
flush=True)
self.num_tokentypes = num_tokentypes
self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
self.hidden_size)
# Initialize the token-type embeddings.
args = get_args()
self.init_method(self.tokentype_embeddings.weight)
def forward(self, input_ids, position_ids, tokentype_ids=None):
# Embeddings.
words_embeddings = self.word_embeddings(input_ids)
embeddings = words_embeddings
if self.position_embedding_type == PositionEmbeddingType.absolute:
assert self.position_embeddings is not None
embeddings = embeddings + self.position_embeddings(position_ids)
else:
assert self.position_embeddings is None
if tokentype_ids is not None:
assert self.tokentype_embeddings is not None
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
else:
assert self.tokentype_embeddings is None
# Dropout.
embeddings = self.embedding_dropout(embeddings)
return embeddings
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load."""
state_dict_ = {}
state_dict_[self._word_embeddings_key] \
= self.word_embeddings.state_dict(destination, prefix, keep_vars)
if self.position_embedding_type == PositionEmbeddingType.absolute:
state_dict_[self._position_embeddings_key] \
= self.position_embeddings.state_dict(
destination, prefix, keep_vars)
if self.num_tokentypes > 0:
state_dict_[self._tokentype_embeddings_key] \
= self.tokentype_embeddings.state_dict(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
# Word embedding.
if self._word_embeddings_key in state_dict:
state_dict_ = state_dict[self._word_embeddings_key]
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if 'word_embeddings' in key:
state_dict_[key.split('word_embeddings.')[1]] \
= state_dict[key]
self.word_embeddings.load_state_dict(state_dict_, strict=strict)
# Position embedding.
if self.position_embedding_type == PositionEmbeddingType.absolute:
if self._position_embeddings_key in state_dict:
state_dict_ = state_dict[self._position_embeddings_key]
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if 'position_embeddings' in key:
state_dict_[key.split('position_embeddings.')[1]] \
= state_dict[key]
self.position_embeddings.load_state_dict(state_dict_, strict=strict)
# Tokentype embedding.
if self.num_tokentypes > 0:
state_dict_ = {}
if self._tokentype_embeddings_key in state_dict:
state_dict_ = state_dict[self._tokentype_embeddings_key]
else:
# for backward compatibility.
for key in state_dict.keys():
if 'tokentype_embeddings' in key:
state_dict_[key.split('tokentype_embeddings.')[1]] \
= state_dict[key]
if len(state_dict_.keys()) > 0:
self.tokentype_embeddings.load_state_dict(state_dict_,
strict=strict)
else:
print('***WARNING*** expected tokentype embeddings in the '
'checkpoint but could not find it', flush=True)
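# Illustrative sketch (not part of the original module): the single-process
# analogue of Embedding.forward with absolute position embeddings. The sizes
# below are assumptions chosen only for the example.
def _sketch_embedding_sum():
    vocab, max_pos, hidden = 16, 8, 4
    word = torch.nn.Embedding(vocab, hidden)
    pos = torch.nn.Embedding(max_pos, hidden)
    dropout = torch.nn.Dropout(0.1)
    input_ids = torch.tensor([[1, 2, 3, 4]])  # [b, s]
    position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)  # [b, s]
    # Word and position embeddings are summed, then dropout is applied.
    return dropout(word(input_ids) + pos(position_ids))  # [b, s, h]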
class EmbeddingPipe(Embedding):
def forward(self, inputs, **kwargs):
if not hasattr(self, '_args'):
self._args = get_args()
input_ids = inputs[0]
position_ids = inputs[1]
if getattr(self._args, 'pretrain_causal_attention', False):
attention_mask = None
else:
attention_mask = inputs[2]
if len(inputs) == 4:
tokentype_ids = inputs[3]
else:
tokentype_ids = None
embeddings = super().forward(input_ids, position_ids, tokentype_ids=tokentype_ids)
# If the attention mask is fixed by the command-line args (causal pretraining), don't forward it as an activation.
if getattr(self._args, 'pretrain_causal_attention', False):
return embeddings
else:
return embeddings, attention_mask
@property
def word_embeddings_weight(self):
"""Easy accessory for the DeepSpeed pipeline engine to tie embeddings across stages."""
return self.word_embeddings.weight
class TransformerLanguageModel(MegatronModule):
"""Transformer language model.
Arguments:
transformer_hparams: transformer hyperparameters
vocab_size: vocabulary size
embedding_dropout_prob: dropout probability for embeddings
num_tokentypes: size of the token-type embeddings. 0 value
will ignore this embedding
"""
def __init__(self,
init_method,
output_layer_init_method,
encoder_attn_mask_type,
num_tokentypes=0,
add_decoder=False,
decoder_attn_mask_type=AttnMaskType.causal,
add_pooler=False,
pre_process=True,
post_process=True):
super(TransformerLanguageModel, self).__init__()
args = get_args()
self.pre_process = pre_process
self.post_process = post_process
self.hidden_size = args.hidden_size
self.num_tokentypes = num_tokentypes
self.init_method = init_method
self.encoder_attn_mask_type = encoder_attn_mask_type
self.add_decoder = add_decoder
self.decoder_attn_mask_type = decoder_attn_mask_type
self.add_pooler = add_pooler
# Embeddings.
if self.pre_process:
self.embedding = Embedding(self.hidden_size,
args.padded_vocab_size,
args.hidden_dropout,
self.init_method,
self.num_tokentypes)
self._embedding_key = 'embedding'
# Transformer.
self.encoder = ParallelTransformer(
self.init_method,
output_layer_init_method,
self_attn_mask_type=self.encoder_attn_mask_type,
pre_process=self.pre_process,
post_process=self.post_process
)
self._encoder_key = 'encoder'
# Decoder
if self.add_decoder:
assert args.pipeline_model_parallel_size == 1, \
'pipeline parallelism is not supported in the presence of decoder'
self.decoder = ParallelTransformer(
self.init_method,
output_layer_init_method,
layer_type=LayerType.decoder,
self_attn_mask_type=self.decoder_attn_mask_type)
self._decoder_key = 'decoder'
if self.post_process:
# Pooler.
if self.add_pooler:
self.pooler = Pooler(self.hidden_size, self.init_method)
self._pooler_key = 'pooler'
def set_input_tensor(self, input_tensor):
""" See megatron.model.transformer.set_input_tensor()"""
self.encoder.set_input_tensor(input_tensor)
def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
enc_dec_attn_mask=None, tokentype_ids=None, layer_past=None,
get_key_value=False, pooling_sequence_index=0,
enc_hidden_states=None, output_enc_hidden=False):
# Embeddings.
if self.pre_process:
embedding_output = self.embedding(enc_input_ids, enc_position_ids,
tokentype_ids=tokentype_ids)
encoder_input = embedding_output
else:
encoder_input = None
# encoder.
if enc_hidden_states is None:
encoder_output = self.encoder(encoder_input,
enc_attn_mask,
layer_past=layer_past,
get_key_value=get_key_value)
else:
encoder_output = enc_hidden_states.to(encoder_input.dtype)
if self.post_process:
if self.add_pooler:
pooled_output = self.pooler(encoder_output,
pooling_sequence_index)
# output_enc_hidden is used when only the encoder's output is needed,
# for example to compute similarity between two sequences via
# average pooling.
if not self.add_decoder or output_enc_hidden:
if self.add_pooler and self.post_process:
return encoder_output, pooled_output
else:
return encoder_output
# Decoder Embedding
dec_embedding_output = self.embedding(dec_input_ids,
dec_position_ids)
# decoder
decoder_output = self.decoder(dec_embedding_output,
dec_attn_mask,
layer_past=layer_past,
get_key_value=get_key_value,
encoder_output=encoder_output,
enc_dec_attn_mask=enc_dec_attn_mask)
if self.add_pooler and self.post_process:
return decoder_output, encoder_output, pooled_output
else:
return decoder_output, encoder_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load."""
state_dict_ = {}
if self.pre_process:
state_dict_[self._embedding_key] \
= self.embedding.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
state_dict_[self._encoder_key] \
= self.encoder.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.post_process:
if self.add_pooler:
state_dict_[self._pooler_key] \
= self.pooler.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.add_decoder:
state_dict_[self._decoder_key] \
= self.decoder.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
# Embedding.
if self.pre_process:
if self._embedding_key in state_dict:
state_dict_ = state_dict[self._embedding_key]
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if '_embeddings' in key:
state_dict_[key] = state_dict[key]
self.embedding.load_state_dict(state_dict_, strict=strict)
# Encoder.
if self._encoder_key in state_dict:
state_dict_ = state_dict[self._encoder_key]
# for backward compatibility.
elif 'transformer' in state_dict:
state_dict_ = state_dict['transformer']
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if 'transformer.' in key:
state_dict_[key.split('transformer.')[1]] = state_dict[key]
# for backward compatibility.
state_dict_self_attention = {}
for key in state_dict_.keys():
if '.attention.' in key:
state_dict_self_attention[key.replace(".attention.",
".self_attention.")] = state_dict_[key]
else:
state_dict_self_attention[key] = state_dict_[key]
state_dict_ = state_dict_self_attention
self.encoder.load_state_dict(state_dict_, strict=strict)
if self.post_process:
# pooler
if self.add_pooler:
assert 'pooler' in state_dict, \
'could not find data for pooler in the checkpoint'
self.pooler.load_state_dict(state_dict[self._pooler_key],
strict=strict)
# decoder
if self.add_decoder:
assert 'decoder' in state_dict, \
'could not find data for decoder in the checkpoint'
self.decoder.load_state_dict(state_dict[self._decoder_key],
strict=strict)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron Module"""
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from megatron import get_args
from megatron import mpu
_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)
def param_is_not_shared(param):
return not hasattr(param, 'shared') or not param.shared
class MegatronModule(torch.nn.Module):
"""Megatron specific extensions of torch Module with support
for pipelining."""
def __init__(self, share_word_embeddings=True):
super(MegatronModule, self).__init__()
self.share_word_embeddings = share_word_embeddings
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""Use this function to override the state dict for
saving checkpoints."""
return self.state_dict(destination, prefix, keep_vars)
def word_embeddings_weight(self):
if mpu.is_pipeline_first_stage(ignore_virtual=True):
return self.language_model.embedding.word_embeddings.weight
if mpu.is_pipeline_last_stage(ignore_virtual=True):
if not self.share_word_embeddings:
raise Exception('word_embeddings_weight() called for last '
'stage, but share_word_embeddings is false')
return self.word_embeddings.weight
raise Exception('word_embeddings_weight() should be '
'called for first and last stage only')
def initialize_word_embeddings(self, init_method_normal):
args = get_args()
if not self.share_word_embeddings:
raise Exception('initialize_word_embeddings() was called but '
'share_word_embeddings is false')
# This function just initializes the word embeddings in the final stage
# when we are using pipeline parallelism. If we aren't using pipeline
# parallelism there is nothing to do.
if args.pipeline_model_parallel_size == 1:
return
# Parameters are shared between the word embeddings layer, and the
# heads at the end of the model. In a pipelined setup with more than
# one stage, the initial embedding layer and the head are on different
# workers, so we do the following:
# 1. Create a second copy of word_embeddings on the last stage, with
# initial parameters of 0.0.
# 2. Do an all-reduce between the first and last stage to ensure that
# the two copies of word_embeddings start off with the same
# parameter values.
# 3. In the training loop, all-reduce the grads of the two
# word_embeddings layers so that every applied weight update is the
# same on both stages.
if mpu.is_pipeline_last_stage():
assert not mpu.is_pipeline_first_stage()
self._word_embeddings_for_head_key = 'word_embeddings_for_head'
# set word_embeddings weights to 0 here, then copy first
# stage's weights using all_reduce below.
self.word_embeddings = mpu.VocabParallelEmbedding(
args.padded_vocab_size, args.hidden_size,
init_method=init_method_normal(args.init_method_std))
self.word_embeddings.weight.data.fill_(0)
self.word_embeddings.weight.shared = True
# Ensure that first and last stages have the same initial parameter
# values.
if torch.distributed.is_initialized():
if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
torch.distributed.all_reduce(self.word_embeddings_weight().data,
group=mpu.get_embedding_group())
else:
print("WARNING! Distributed processes aren't initialized, so "
"word embeddings in the last layer are not initialized. "
"If you are just manipulating a model this is fine, but "
"this needs to be handled manually. If you are training "
"something is definitely wrong.")
def conversion_helper(val, conversion):
"""Apply conversion to val. Recursively apply conversion if `val`
#is a nested tuple/list structure."""
if not isinstance(val, (tuple, list)):
return conversion(val)
rtn = [conversion_helper(v, conversion) for v in val]
if isinstance(val, tuple):
rtn = tuple(rtn)
return rtn
def fp32_to_float16(val, float16_convertor):
"""Convert fp32 `val` to fp16/bf16"""
def half_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, _FLOAT_TYPES):
val = float16_convertor(val)
return val
return conversion_helper(val, half_conversion)
def float16_to_fp32(val):
"""Convert fp16/bf16 `val` to fp32"""
def float_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)):
val = val.float()
return val
return conversion_helper(val, float_conversion)
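# Illustrative sketch (not part of the original module): conversion_helper
# walks nested tuples/lists, so fp32_to_float16 / float16_to_fp32 convert
# every tensor leaf while preserving the container structure.
def _sketch_dtype_roundtrip():
    nested = (torch.randn(2, 2), [torch.randn(3)])
    halved = fp32_to_float16(nested, lambda t: t.half())
    restored = float16_to_fp32(halved)
    assert halved[0].dtype == torch.half
    assert restored[1][0].dtype == torch.float32
    return restored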
class Float16Module(MegatronModule):
def __init__(self, module, args):
super(Float16Module, self).__init__()
if args.fp16:
self.add_module('module', module.half())
def float16_convertor(val):
return val.half()
elif args.bf16:
self.add_module('module', module.bfloat16())
def float16_convertor(val):
return val.bfloat16()
else:
raise Exception('should not be here')
self.float16_convertor = float16_convertor
def forward(self, *inputs, **kwargs):
if mpu.is_pipeline_first_stage():
inputs = fp32_to_float16(inputs, self.float16_convertor)
outputs = self.module(*inputs, **kwargs)
if mpu.is_pipeline_last_stage():
outputs = float16_to_fp32(outputs)
return outputs
def state_dict(self, destination=None, prefix='', keep_vars=False):
return self.module.state_dict(destination, prefix, keep_vars)
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
return self.module.state_dict_for_save_checkpoint(destination, prefix,
keep_vars)
def load_state_dict(self, state_dict, strict=True):
self.module.load_state_dict(state_dict, strict=strict)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multiple choice model."""
import torch
from megatron import get_args, print_rank_last
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from .module import MegatronModule
class MultipleChoice(MegatronModule):
def __init__(self,
num_tokentypes=2,
pre_process=True,
post_process=True):
super(MultipleChoice, self).__init__(share_word_embeddings=False)
args = get_args()
init_method = init_method_normal(args.init_method_std)
self.pre_process = pre_process
self.post_process = post_process
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=True,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers),
pre_process=self.pre_process,
post_process=self.post_process)
# Multi-choice head.
if self.post_process:
self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
self.multichoice_head = get_linear_layer(args.hidden_size, 1,
init_method)
self._multichoice_head_key = 'multichoice_head'
def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
self.language_model.set_input_tensor(input_tensor)
def forward(self, model_input, attention_mask, tokentype_ids=None):
# [batch, choices, sequence] --> [batch * choices, sequence] -->
# transformer --> [batch, choices] --> softmax
# Ensure the shape is [batch-size, choices, sequence]
assert len(attention_mask.shape) == 3
num_choices = attention_mask.shape[1]
# Reshape and treat choice dimension the same as batch.
attention_mask = attention_mask.view(-1, attention_mask.size(-1))
extended_attention_mask = bert_extended_attention_mask(attention_mask)
input_ids = model_input
# Do the same as attention_mask for input_ids, tokentype_ids
assert len(input_ids.shape) == 3
assert len(tokentype_ids.shape) == 3
input_ids = input_ids.view(-1, input_ids.size(-1))
tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))
position_ids = bert_position_ids(input_ids)
lm_output = self.language_model(
input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids
)
if self.post_process:
_, pooled_output = lm_output
multichoice_output = self.multichoice_dropout(pooled_output)
multichoice_logits = self.multichoice_head(multichoice_output)
# Reshape back to separate choices.
multichoice_logits = multichoice_logits.view(-1, num_choices)
return multichoice_logits
return lm_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.post_process:
state_dict_[self._multichoice_head_key] \
= self.multichoice_head.state_dict(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
if self.post_process:
if self._multichoice_head_key in state_dict:
self.multichoice_head.load_state_dict(
state_dict[self._multichoice_head_key], strict=strict)
else:
print_rank_last('***WARNING*** could not find {} in the checkpoint, '
'initializing to random'.format(
self._multichoice_head_key))
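# Illustrative sketch (not part of the original module): the shape handling in
# MultipleChoice.forward, with a plain linear head standing in for the language
# model and pooler. batch=2, choices=3, seq=5, hidden=4 are assumptions.
def _sketch_multichoice_shapes():
    batch, choices, seq, hidden = 2, 3, 5, 4
    input_ids = torch.randint(0, 10, (batch, choices, seq))
    flat_ids = input_ids.view(-1, input_ids.size(-1))  # [batch * choices, seq]
    pooled = torch.randn(flat_ids.size(0), hidden)     # stand-in pooler output
    logits = torch.nn.Linear(hidden, 1)(pooled)        # [batch * choices, 1]
    return logits.view(-1, choices)                    # [batch, choices]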
# Extracted from: https://github.com/EleutherAI/gpt-neox
import torch
class RotaryEmbedding(torch.nn.Module):
def __init__(self, dim, base=10000, precision=torch.half):
super().__init__()
inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
self.register_buffer('inv_freq', inv_freq)
self.max_seq_len_cached = None
self.cos_cached = None
self.sin_cached = None
self.precision = precision
def forward(self, x, seq_dim=1, seq_len=None):
if seq_len is None:
seq_len = x.shape[seq_dim]
if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
freqs = torch.einsum('i,j->ij', t, self.inv_freq)
# Different from the paper: this uses a different permutation of the dimensions but yields the same result.
emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
if self.precision == torch.bfloat16:
emb = emb.float()
# [sx, 1 (b * np), hn]
self.cos_cached = emb.cos()[:, None, :]
self.sin_cached = emb.sin()[:, None, :]
if self.precision == torch.bfloat16:
self.cos_cached = self.cos_cached.bfloat16()
self.sin_cached = self.sin_cached.bfloat16()
return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
# rotary pos emb helpers:
def rotate_half(x):
x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions
@torch.jit.script
def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...]
return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
def apply_rotary_pos_emb_torch(q, k, cos, sin, offset: int = 0): # jitting fails with bf16
cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...]
return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
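# Illustrative sketch (not part of the original file): applying the rotary
# embedding above to toy query/key tensors shaped [seq, batch * heads, head_dim].
# The sizes are assumptions; apply_rotary_pos_emb_torch is used so the example
# also covers dtypes the jit-scripted variant does not handle.
def _sketch_rotary_usage():
    seq, bnp, hn = 4, 2, 8
    q = torch.randn(seq, bnp, hn)
    k = torch.randn(seq, bnp, hn)
    rotary = RotaryEmbedding(hn, precision=torch.float)
    cos, sin = rotary(q, seq_len=seq)  # each [seq, 1, head_dim]
    q_rot, k_rot = apply_rotary_pos_emb_torch(q, k, cos, sin)
    return q_rot, k_rot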
import os
import torch
from megatron import get_args, print_rank_0
from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
from megatron.model.bert_model import BertModel
from .module import MegatronModule
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.language_model import get_language_model
from megatron.model.utils import scaled_init_method_normal
from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
def general_ict_model_provider(only_query_model=False, only_block_model=False):
"""Build the model."""
args = get_args()
assert args.ict_head_size is not None, \
"Need to specify --ict-head-size to provide an ICTBertModel"
assert mpu.get_tensor_model_parallel_world_size() == 1 and mpu.get_pipeline_model_parallel_world_size() == 1, \
"Model parallel size > 1 not supported for ICT"
print_rank_0('building ICTBertModel...')
# simpler to just keep using 2 tokentypes since the LM we initialize with has 2 tokentypes
model = ICTBertModel(
ict_head_size=args.ict_head_size,
num_tokentypes=2,
parallel_output=True,
only_query_model=only_query_model,
only_block_model=only_block_model)
return model
class ICTBertModel(MegatronModule):
"""Bert-based module for Inverse Cloze task."""
def __init__(self,
ict_head_size,
num_tokentypes=1,
parallel_output=True,
only_query_model=False,
only_block_model=False):
super(ICTBertModel, self).__init__()
bert_kwargs = dict(
ict_head_size=ict_head_size,
num_tokentypes=num_tokentypes,
parallel_output=parallel_output
)
assert not (only_block_model and only_query_model)
self.use_block_model = not only_query_model
self.use_query_model = not only_block_model
if self.use_query_model:
# this model embeds (pseudo-)queries - Embed_input in the paper
self.query_model = IREncoderBertModel(**bert_kwargs)
self._query_key = 'question_model'
if self.use_block_model:
# this model embeds evidence blocks - Embed_doc in the paper
self.block_model = IREncoderBertModel(**bert_kwargs)
self._block_key = 'context_model'
def forward(self, query_tokens, query_attention_mask, block_tokens, block_attention_mask):
"""Run a forward pass for each of the models and return the respective embeddings."""
query_logits = self.embed_query(query_tokens, query_attention_mask)
block_logits = self.embed_block(block_tokens, block_attention_mask)
return query_logits, block_logits
def embed_query(self, query_tokens, query_attention_mask):
"""Embed a batch of tokens using the query model"""
if self.use_query_model:
query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0)
query_ict_logits, _ = self.query_model.forward(query_tokens, query_attention_mask, query_types)
return query_ict_logits
else:
raise ValueError("Cannot embed query without query model.")
def embed_block(self, block_tokens, block_attention_mask):
"""Embed a batch of tokens using the block model"""
if self.use_block_model:
block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0)
block_ict_logits, _ = self.block_model.forward(block_tokens, block_attention_mask, block_types)
return block_ict_logits
else:
raise ValueError("Cannot embed block without block model.")
def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False):
"""Save dict with state dicts of each of the models."""
state_dict_ = {}
if self.use_query_model:
state_dict_[self._query_key] \
= self.query_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.use_block_model:
state_dict_[self._block_key] \
= self.block_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Load the state dicts of each of the models"""
if self.use_query_model:
print("Loading ICT query model", flush=True)
self.query_model.load_state_dict(
state_dict[self._query_key], strict=strict)
if self.use_block_model:
print("Loading ICT block model", flush=True)
self.block_model.load_state_dict(
state_dict[self._block_key], strict=strict)
def init_state_dict_from_bert(self):
"""Initialize the state from a pretrained BERT model on iteration zero of ICT pretraining"""
args = get_args()
tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
if not os.path.isfile(tracker_filename):
raise FileNotFoundError("Could not find BERT load for ICT")
with open(tracker_filename, 'r') as f:
iteration = int(f.read().strip())
assert iteration > 0
checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
if mpu.get_data_parallel_rank() == 0:
print('global rank {} is loading checkpoint {}'.format(
torch.distributed.get_rank(), checkpoint_name))
try:
state_dict = torch.load(checkpoint_name, map_location='cpu')
except BaseException:
raise ValueError("Could not load checkpoint")
# load the LM state dict into each model
model_dict = state_dict['model']['language_model']
self.query_model.language_model.load_state_dict(model_dict)
self.block_model.language_model.load_state_dict(model_dict)
# give each model the same ict_head to begin with as well
query_ict_head_state_dict = self.state_dict_for_save_checkpoint()[self._query_key]['ict_head']
self.block_model.ict_head.load_state_dict(query_ict_head_state_dict)
class IREncoderBertModel(MegatronModule):
"""BERT-based encoder for queries or blocks used for learned information retrieval."""
def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True):
super(IREncoderBertModel, self).__init__()
args = get_args()
self.ict_head_size = ict_head_size
self.parallel_output = parallel_output
init_method = init_method_normal(args.init_method_std)
scaled_init_method = scaled_init_method_normal(args.init_method_std,
args.num_layers)
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=True,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method)
self.ict_head = get_linear_layer(args.hidden_size, ict_head_size, init_method)
self._ict_head_key = 'ict_head'
def forward(self, input_ids, attention_mask, tokentype_ids=None):
extended_attention_mask = bert_extended_attention_mask(
attention_mask, next(self.language_model.parameters()).dtype)
position_ids = bert_position_ids(input_ids)
lm_output, pooled_output = self.language_model(
input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids)
# Output.
ict_logits = self.ict_head(pooled_output)
return ict_logits, None
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
state_dict_[self._ict_head_key] \
= self.ict_head.state_dict(destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
self.ict_head.load_state_dict(
state_dict[self._ict_head_key], strict=strict)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""T5 model."""
import torch
from megatron import (
get_args,
mpu
)
from megatron.enums import AttnMaskType
from megatron.model.language_model import parallel_lm_logits, get_language_model
from megatron.model.transformer import LayerNorm
from megatron.model.utils import (
openai_gelu,
get_linear_layer,
init_method_normal,
scaled_init_method_normal
)
from .module import MegatronModule
def t5_extended_attention_mask(attention_mask_list):
def attn_mask_postprocess(attn_mask):
# [b, 1, s, s]
extended_attention_mask = attn_mask.unsqueeze(1)
return extended_attention_mask
return [attn_mask_postprocess(attn_mask) for attn_mask in attention_mask_list]
def t5_position_ids(token_ids):
# Create position ids
seq_length = token_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long,
device=token_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
return position_ids
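# Illustrative sketch (not part of the original module): shapes produced by the
# two helpers above for a toy batch. The sizes are assumptions for the example.
def _sketch_t5_mask_and_positions():
    token_ids = torch.randint(0, 10, (2, 5))                   # [b, s]
    attn_mask = torch.ones(2, 5, 5, dtype=torch.bool)          # [b, s, s]
    [extended_mask] = t5_extended_attention_mask([attn_mask])  # [b, 1, s, s]
    position_ids = t5_position_ids(token_ids)                  # [b, s]
    return extended_mask.shape, position_ids.shape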
class T5LMHead(MegatronModule):
"""Masked LM head for T5
Arguments:
mpu_vocab_size: size of the model-parallel partition of the vocabulary.
hidden_size: hidden size
init_method: init method for weight initialization
layernorm_epsilon: tolerance for layer norm divisions
parallel_output: whether to keep the output logits model-parallel (distributed) or gather them.
"""
def __init__(self, mpu_vocab_size, parallel_output):
super(T5LMHead, self).__init__()
args = get_args()
self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
self.bias.model_parallel = True
self.bias.partition_dim = 0
self.bias.stride = 1
self.parallel_output = parallel_output
def forward(self, hidden_states, word_embeddings_weight):
output = parallel_lm_logits(hidden_states,
word_embeddings_weight,
self.parallel_output,
bias=self.bias)
return output
class T5Model(MegatronModule):
"""T5 Language model."""
def __init__(self, num_tokentypes=0, parallel_output=True):
super(T5Model, self).__init__()
args = get_args()
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
self.parallel_output = parallel_output
init_method = init_method_normal(args.init_method_std)
scaled_init_method = scaled_init_method_normal(args.init_method_std,
args.num_layers)
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=False,
add_decoder=True,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method)
self.lm_head = T5LMHead(
self.language_model.embedding.word_embeddings.weight.size(0),
parallel_output)
self._lm_head_key = 'lm_head'
def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
self.language_model.set_input_tensor(input_tensor)
def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask,
decoder_attn_mask, encoder_decoder_attn_mask,
tokentype_ids=None, lm_labels=None, enc_hidden_states=None):
# Converting the attention masks to proper parameter settings
encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask = t5_extended_attention_mask(
[encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask])
encoder_position_ids = t5_position_ids(encoder_input_ids)
decoder_position_ids = t5_position_ids(decoder_input_ids)
lm_output = self.language_model(encoder_input_ids,
encoder_position_ids,
encoder_attn_mask,
decoder_input_ids,
decoder_position_ids,
decoder_attn_mask,
encoder_decoder_attn_mask,
tokentype_ids=tokentype_ids,
enc_hidden_states=enc_hidden_states)
decoder_output, encoder_output = lm_output
# Output.
lm_logits = self.lm_head(decoder_output,
self.language_model.embedding.word_embeddings.weight)
if lm_labels is None:
return lm_logits, encoder_output
else:
if self.fp16_lm_cross_entropy:
assert lm_logits.dtype == torch.half
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
else:
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
lm_labels)
return lm_loss, encoder_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
state_dict_[self._lm_head_key] \
= self.lm_head.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
self.lm_head.load_state_dict(state_dict[self._lm_head_key],
strict=strict)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer."""
import math
import torch
import torch.nn.functional as F
from torch import nn
from megatron import get_args, logging
from megatron import mpu
from .module import MegatronModule
from megatron.enums import AttnMaskType, LayerType, AttnType, PositionEmbeddingType
from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from megatron.model.fused_softmax import FusedScaleMaskSoftmax
from megatron.model.fused_bias_gelu import bias_gelu_impl
from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu
import deepspeed
from .glu_activations import GLU_ACTIVATIONS
from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb
# flags required to enable jit fusion kernels
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
logger = logging.get_logger(__name__)
""" We use the following notation throughout this file:
h: hidden size
n: number of attention heads
p: number of model parallel partitions
np: n/p
hp: h/p
hn: h/n
b: batch size
s: sequence length
l: number of layers
Transformer takes input of size [s, b, h] and returns a
tensor of the same size. We use the following arguments:
hyperparameters: transformer hyperparameters
"""
class ParallelMLP(MegatronModule):
"""MLP.
MLP will take the input with h hidden state, project it to 4*h
hidden dimension, perform nonlinear transformation, and project the
state back into h hidden dimension. At the end, dropout is also
applied.
"""
def __init__(self, init_method, output_layer_init_method):
super(ParallelMLP, self).__init__()
args = get_args()
# Project to ffn_hidden_size
self.dense_h_to_4h = mpu.ColumnParallelLinear(
args.hidden_size,
# GLU activations halve the feature dimension, so project to twice the width here.
2 * args.ffn_hidden_size if args.glu_activation else args.ffn_hidden_size,
gather_output=False,
init_method=init_method,
skip_bias_add=True)
self.bias_gelu_fusion = args.bias_gelu_fusion
self.activation_func = F.gelu
if args.glu_activation:
self.activation_func = GLU_ACTIVATIONS[args.glu_activation]
elif args.openai_gelu:
self.activation_func = openai_gelu
elif args.onnx_safe:
self.activation_func = erf_gelu
# Project back to h.
self.dense_4h_to_h = mpu.RowParallelLinear(
args.ffn_hidden_size,
args.hidden_size,
input_is_parallel=True,
init_method=output_layer_init_method,
skip_bias_add=True)
def forward(self, hidden_states):
# [s, b, 4hp]
intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states)
if self.bias_gelu_fusion:
intermediate_parallel = \
bias_gelu_impl(intermediate_parallel, bias_parallel)
else:
intermediate_parallel = \
self.activation_func(intermediate_parallel + bias_parallel)
# [s, b, h]
output, output_bias = self.dense_4h_to_h(intermediate_parallel)
return output, output_bias
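# Illustrative sketch (not part of the original module): the single-GPU
# analogue of ParallelMLP (h -> 4h -> GeLU -> h), without bias-GeLU fusion or
# tensor parallelism. The sizes below are assumptions for the example.
def _sketch_plain_mlp():
    hidden_size, ffn_hidden_size = 8, 32
    dense_h_to_4h = torch.nn.Linear(hidden_size, ffn_hidden_size)
    dense_4h_to_h = torch.nn.Linear(ffn_hidden_size, hidden_size)
    hidden_states = torch.randn(4, 2, hidden_size)       # [s, b, h]
    intermediate = F.gelu(dense_h_to_4h(hidden_states))  # [s, b, 4h]
    return dense_4h_to_h(intermediate)                   # [s, b, h]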
class ParallelAttention(MegatronModule):
"""Parallel self-attention layer abstract class.
Self-attention layer takes input with size [b, s, h]
and returns output of the same size.
"""
def __init__(self, init_method,
output_layer_init_method, layer_number,
attention_type=AttnType.self_attn,
attn_mask_type=AttnMaskType.padding):
super(ParallelAttention, self).__init__()
args = get_args()
self.fp16 = args.fp16
self.bf16 = args.bf16
self.position_embedding_type = args.position_embedding_type
self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
if self.apply_query_key_layer_scaling:
self.attention_softmax_in_fp32 = True
self.layer_number = max(1, layer_number)
self.attention_type = attention_type
self.attn_mask_type = attn_mask_type
projection_size = args.kv_channels * args.num_attention_heads
# Per attention head and per partition values.
world_size = mpu.get_tensor_model_parallel_world_size()
self.hidden_size_per_partition = mpu.divide(projection_size,
world_size)
self.hidden_size_per_attention_head = mpu.divide(
projection_size, args.num_attention_heads)
self.num_attention_heads_per_partition = mpu.divide(
args.num_attention_heads, world_size)
# Strided linear layer.
if attention_type == AttnType.self_attn:
self.query_key_value = mpu.ColumnParallelLinear(
args.hidden_size,
3 * projection_size,
gather_output=False,
init_method=init_method)
else:
assert attention_type == AttnType.cross_attn
self.query = mpu.ColumnParallelLinear(
args.hidden_size,
projection_size,
gather_output=False,
init_method=init_method)
self.key_value = mpu.ColumnParallelLinear(
args.hidden_size,
2 * projection_size,
gather_output=False,
init_method=init_method)
coeff = None
self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
if self.apply_query_key_layer_scaling:
coeff = self.layer_number
self.norm_factor *= coeff
self.scale_mask_softmax = FusedScaleMaskSoftmax(
self.fp16, self.bf16,
self.attn_mask_type,
args.masked_softmax_fusion,
attention_mask_func,
self.attention_softmax_in_fp32,
coeff)
# Dropout. Note that for a single iteration, this layer will generate
# different outputs on different numbers of parallel partitions but
# on average it should not be partition dependent.
self.attention_dropout = torch.nn.Dropout(args.attention_dropout)
# Output.
self.dense = mpu.RowParallelLinear(
projection_size,
args.hidden_size,
input_is_parallel=True,
init_method=output_layer_init_method,
skip_bias_add=True)
if deepspeed.checkpointing.is_configured():
global get_cuda_rng_tracker, checkpoint
get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
checkpoint = deepspeed.checkpointing.checkpoint
if self.position_embedding_type == PositionEmbeddingType.rotary:
self.rotary_emb = RotaryEmbedding(self.hidden_size_per_attention_head, precision=args.params_dtype)
def forward(self, hidden_states, attention_mask, layer_past=None,
get_key_value=False, encoder_output=None, alibi=None):
# hidden_states: [sq, b, h]
# =====================
# Query, Key, and Value
# =====================
if self.attention_type == AttnType.self_attn:
# Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
mixed_x_layer, _ = self.query_key_value(hidden_states)
# [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
new_tensor_shape = mixed_x_layer.size()[:-1] + \
(self.num_attention_heads_per_partition,
3 * self.hidden_size_per_attention_head)
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
(query_layer,
key_layer,
value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3)
else:
# Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
mixed_kv_layer, _ = self.key_value(encoder_output)
# [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
new_tensor_shape = mixed_kv_layer.size()[:-1] + \
(self.num_attention_heads_per_partition,
2 * self.hidden_size_per_attention_head)
mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)
# [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
(key_layer,
value_layer) = mpu.split_tensor_along_last_dim(mixed_kv_layer, 2)
# Attention head [sq, b, h] --> [sq, b, hp]
query_layer, _ = self.query(hidden_states)
# [sq, b, hp] --> [sq, b, np, hn]
new_tensor_shape = query_layer.size()[:-1] + \
(self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head)
query_layer = query_layer.view(*new_tensor_shape)
# ==================================
# Adjust key and value for inference
# ==================================
if layer_past is not None:
past_key, past_value = layer_past
key_layer = torch.cat((past_key.type_as(key_layer),
key_layer), dim=0)
value_layer = torch.cat((past_value.type_as(value_layer),
value_layer), dim=0)
if get_key_value:
present = (key_layer, value_layer)
# ===================================
# Raw attention scores. [b, np, s, s]
# ===================================
# [b, np, sq, sk]
output_size = (query_layer.size(1),
query_layer.size(2),
query_layer.size(0),
key_layer.size(0))
# [sq, b, np, hn] -> [sq, b * np, hn]
query_layer = query_layer.view(output_size[2],
output_size[0] * output_size[1], -1)
# [sk, b, np, hn] -> [sk, b * np, hn]
key_layer = key_layer.view(output_size[3],
output_size[0] * output_size[1], -1)
# preallocating result tensor: [b * np, sq, sk]
if alibi is None:
matmul_result = torch.empty(
output_size[0]*output_size[1],
output_size[2],
output_size[3],
dtype=query_layer.dtype,
device=torch.cuda.current_device())
else:
matmul_result = alibi[:output_size[0]*output_size[1], :, :output_size[3]]
# Rotary embeddings
if self.position_embedding_type == PositionEmbeddingType.rotary:
apply_rotary_fn = apply_rotary_pos_emb_torch if self.bf16 else apply_rotary_pos_emb
seq_len = key_layer.shape[0]
offset = 0
if layer_past is not None and layer_past.numel() > 0:
offset = layer_past[0].shape[0]
seq_len += offset
cos, sin = self.rotary_emb(value_layer, seq_len=seq_len)
query_layer, key_layer = apply_rotary_fn(query_layer, key_layer, cos, sin, offset=offset)
# Raw attention scores. [b * np, sq, sk]
if alibi is None:
matmul_result = torch.baddbmm(
matmul_result,
query_layer.transpose(0, 1), # [b * np, sq, hn]
key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
beta=0.0, alpha=(1.0/self.norm_factor))
else:
if not hasattr(self, "logged_alibi"):
logger.debug("Using Alibi.")
self.logged_alibi = True
if self.apply_query_key_layer_scaling:
beta = 1.0 / self.layer_number
else:
beta = 1.0
matmul_result = torch.baddbmm(
matmul_result,
query_layer.transpose(0, 1), # [b * np, sq, hn]
key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
beta=beta, alpha=(1.0 / self.norm_factor))
# change view to [b, np, sq, sk]
attention_scores = matmul_result.view(*output_size)
# ==================================================
# Update attention mask for inference. [b, np, sq, sk]
# ==================================================
if get_key_value:
with torch.no_grad():
# TODO @thomasw21 Handle case where `attention_mask` is None
if layer_past is not None:
attention_mask = attention_mask[
...,
attention_scores.size(3) - 1,
:attention_scores.size(3)].unsqueeze(2)
else:
attention_mask = attention_mask[
...,
:attention_scores.size(3),
:attention_scores.size(3)]
# ===========================
# Attention probs and dropout
# ===========================
# attention scores and attention mask [b, np, sq, sk]
attention_probs = self.scale_mask_softmax(attention_scores,
attention_mask)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
with mpu.get_cuda_rng_tracker().fork():
attention_probs = self.attention_dropout(attention_probs)
# =========================
# Context layer. [sq, b, hp]
# =========================
# value_layer -> context layer.
# [sk, b, np, hn] --> [b, np, sq, hn]
# context layer shape: [b, np, sq, hn]
output_size = (value_layer.size(1),
value_layer.size(2),
query_layer.size(0),
value_layer.size(3))
# change view [sk, b * np, hn]
value_layer = value_layer.view(value_layer.size(0),
output_size[0] * output_size[1], -1)
# change view [b * np, sq, sk]
attention_probs = attention_probs.view(output_size[0] * output_size[1],
output_size[2], -1)
# matmul: [b * np, sq, hn]
context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
# change view [b, np, sq, hn]
context_layer = context_layer.view(*output_size)
# [b, np, sq, hn] --> [sq, b, np, hn]
context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
# [sq, b, np, hn] --> [sq, b, hp]
new_context_layer_shape = context_layer.size()[:-2] + \
(self.hidden_size_per_partition,)
context_layer = context_layer.view(*new_context_layer_shape)
# =================
# Output. [sq, b, h]
# =================
output, bias = self.dense(context_layer)
if get_key_value:
output = [output, present]
return output, bias
def bias_dropout_add(x, bias, residual, prob, training):
# type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
out = residual + out
return out
def get_bias_dropout_add(training):
def _bias_dropout_add(x, bias, residual, prob):
return bias_dropout_add(x, bias, residual, prob, training)
return _bias_dropout_add
@torch.jit.script
def bias_dropout_add_fused_train(x, bias, residual, prob):
# type: (Tensor, Tensor, Tensor, float) -> Tensor
return bias_dropout_add(x, bias, residual, prob, True)
@torch.jit.script
def bias_dropout_add_fused_inference(x, bias, residual, prob):
# type: (Tensor, Tensor, Tensor, float) -> Tensor
return bias_dropout_add(x, bias, residual, prob, False)
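# Illustrative sketch (not part of the original module): in inference mode the
# bias-dropout-add paths above reduce to residual + (x + bias), since dropout
# is the identity when training=False.
def _sketch_bias_dropout_add_eval():
    x, bias, residual = torch.randn(3, 4), torch.randn(4), torch.randn(3, 4)
    func = get_bias_dropout_add(training=False)
    out = func(x, bias.expand_as(x), residual, 0.1)
    assert torch.allclose(out, residual + x + bias)
    return out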
class ParallelTransformerLayer(MegatronModule):
"""A single transformer layer.
Transformer layer takes input with size [b, s, h] and returns an
output of the same size.
"""
def __init__(self, init_method, output_layer_init_method,
layer_number, layer_type=LayerType.encoder,
self_attn_mask_type=AttnMaskType.padding):
args = get_args()
super(ParallelTransformerLayer, self).__init__()
self.layer_number = layer_number
self.layer_type = layer_type
self.apply_residual_connection_post_layernorm \
= args.apply_residual_connection_post_layernorm
self.bf16 = args.bf16
self.fp32_residual_connection = args.fp32_residual_connection
# Layernorm on the input data.
self.input_layernorm = LayerNorm(
args.hidden_size,
eps=args.layernorm_epsilon)
# Self attention.
self.self_attention = ParallelAttention(
init_method,
output_layer_init_method,
layer_number,
attention_type=AttnType.self_attn,
attn_mask_type=self_attn_mask_type)
self.hidden_dropout = args.hidden_dropout
self.bias_dropout_fusion = args.bias_dropout_fusion
# Layernorm on the attention output
self.post_attention_layernorm = LayerNorm(
args.hidden_size,
eps=args.layernorm_epsilon)
if self.layer_type == LayerType.decoder:
self.inter_attention = ParallelAttention(
init_method,
output_layer_init_method,
layer_number,
attention_type=AttnType.cross_attn)
# Layernorm on the attention output.
self.post_inter_attention_layernorm = LayerNorm(
args.hidden_size,
eps=args.layernorm_epsilon)
# MLP
self.mlp = ParallelMLP(init_method,
output_layer_init_method)
# Alibi
if args.position_embedding_type == PositionEmbeddingType.alibi:
self.alibi = self._build_alibi_tensor(args.seq_length, args.num_attention_heads, args.micro_batch_size).to(torch.cuda.current_device())
if args.params_dtype == torch.float16:
self.alibi = self.alibi.to(torch.float16)
elif args.params_dtype == torch.bfloat16:
self.alibi = self.alibi.to(torch.bfloat16)
else:
self.alibi = None
def forward(self, hidden_states, attention_mask,
encoder_output=None, enc_dec_attn_mask=None,
layer_past=None, get_key_value=False):
# hidden_states: [b, s, h]
# Layer norm at the beginning of the transformer layer.
layernorm_output = self.input_layernorm(hidden_states)
# Self attention.
attention_output, attention_bias = \
self.self_attention(layernorm_output,
attention_mask,
layer_past=layer_past,
get_key_value=get_key_value,
alibi=self.alibi)
if get_key_value:
attention_output, presents = attention_output
# Residual connection.
if self.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
# jit scripting for a nn.module (with dropout) is not
# triggering the fusion kernel. For now, we use two
# different nn.functional routines to account for varying
# dropout semantics during training and inference phases.
if self.bias_dropout_fusion:
if self.training:
bias_dropout_add_func = bias_dropout_add_fused_train
else:
bias_dropout_add_func = bias_dropout_add_fused_inference
else:
bias_dropout_add_func = get_bias_dropout_add(self.training)
# re-enable torch grad to enable fused optimization.
with torch.enable_grad():
layernorm_input = bias_dropout_add_func(
attention_output,
attention_bias.expand_as(residual),
residual,
self.hidden_dropout)
# Layer norm post the self attention.
layernorm_output = self.post_attention_layernorm(layernorm_input)
if self.layer_type == LayerType.decoder:
attention_output, attention_bias = \
self.inter_attention(layernorm_output,
enc_dec_attn_mask,
encoder_output=encoder_output)
# residual connection
if self.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = layernorm_input
# re-enable torch grad to enable fused optimization.
with torch.enable_grad():
layernorm_input = bias_dropout_add_func(
attention_output,
attention_bias.expand_as(residual),
residual,
self.hidden_dropout)
# Layer norm post the decoder attention
layernorm_output = self.post_inter_attention_layernorm(layernorm_input)
# MLP.
mlp_output, mlp_bias = self.mlp(layernorm_output)
# Second residual connection.
if self.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = layernorm_input
# re-enable torch grad to enable fused optimization.
with torch.enable_grad():
output = bias_dropout_add_func(
mlp_output,
mlp_bias.expand_as(residual),
residual,
self.hidden_dropout)
if get_key_value:
output = [output, presents]
return output
@staticmethod
def _build_alibi_tensor(max_seq_len, num_attention_heads, batch_size):
# Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
"""Returns tensor shaped (batch_size * num_attention_heads, 1, max_seq_len)"""
def get_slopes(n):
def get_slopes_power_of_2(n):
start = (2 ** (-2 ** -(math.log2(n) - 3)))
ratio = start
return [start * ratio ** i for i in range(n)]
if math.log2(n).is_integer():
return get_slopes_power_of_2(n)
else:
closest_power_of_2 = 2 ** math.floor(math.log2(n))
return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][
:n - closest_power_of_2]
slopes = torch.Tensor(get_slopes(num_attention_heads))
alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand(
num_attention_heads, -1, -1)
        # Select the part of the tensor that corresponds to our tensor parallel index.
tp_world_size = mpu.get_tensor_model_parallel_world_size()
tp_index = mpu.get_tensor_model_parallel_rank()
alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index]
alibi = alibi.repeat(batch_size, 1, 1)
return alibi
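# A minimal, illustrative sketch of the ALiBi slope schedule used above (not
# part of the model code): for a power-of-two head count n the slopes form a
# geometric series starting at 2 ** (-8 / n).
#
#   >>> import math
#   >>> def alibi_slopes(n):
#   ...     start = 2 ** (-2 ** -(math.log2(n) - 3))
#   ...     return [start * start ** i for i in range(n)]
#   >>> alibi_slopes(8)
#   [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625]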
class ParallelTransformerLayerPipe(ParallelTransformerLayer):
"""Extends ParallelTransformerLayer to forward attention_mask through the pipeline.
Forward has two usages that affect attention mask communication:
1) forward((input, attn_mask) , **kwargs) -> (output, mask)
When the attention mask is provided as the second positional
argument, typical pipeline behavior is used and both the output
*and* mask are returned in a tuple. This tuple is then forwarded
to the next stage in the pipeline.
This version is useful if masks are dynamic.
2) forward(input, **kwargs) -> output
When the mask is static over all samples, it is advantageous to
cache the mask and avoid communicating it.
"""
def forward(self, inputs, **kwargs):
assert torch.is_tensor(inputs) or isinstance(inputs, tuple)
if torch.is_tensor(inputs) or len(inputs) == 1:
hidden_states, attention_mask = inputs, None
return super().forward(hidden_states, attention_mask, **kwargs)
elif len(inputs) == 2:
# Attention mask is an activation.
hidden_states, attention_mask = inputs[0], inputs[1]
return super().forward(*inputs, **kwargs), attention_mask
else:
raise RuntimeError('Received more inputs than understood.')
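# Illustrative only -- the two calling conventions described in the docstring
# look roughly like this from the pipeline engine's point of view:
#
#   # 1) dynamic mask: the mask travels with the activations between stages
#   hidden_states, attention_mask = layer((hidden_states, attention_mask))
#
#   # 2) static mask: only activations are communicated; in this variant the
#   #    mask argument reaching ParallelTransformerLayer.forward is None
#   hidden_states = layer(hidden_states)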
class ParallelTransformer(MegatronModule):
"""Transformer class."""
def __init__(self, init_method, output_layer_init_method,
layer_type=LayerType.encoder,
self_attn_mask_type=AttnMaskType.padding,
pre_process=True, post_process=True):
super(ParallelTransformer, self).__init__()
args = get_args()
self.bf16 = args.bf16
self.fp32_residual_connection = args.fp32_residual_connection
self.pre_process = pre_process
self.post_process = post_process
self.input_tensor = None
        # Store activation checkpointing flag.
self.checkpoint_activations = args.checkpoint_activations
self.checkpoint_num_layers = args.checkpoint_num_layers
# Number of layers.
assert args.num_layers % mpu.get_pipeline_model_parallel_world_size() == 0, \
'num_layers must be divisible by pipeline_model_parallel_size'
self.num_layers = args.num_layers // mpu.get_pipeline_model_parallel_world_size()
# Transformer layers.
def build_layer(layer_number):
return ParallelTransformerLayer(
init_method,
output_layer_init_method,
layer_number,
layer_type=layer_type,
self_attn_mask_type=self_attn_mask_type)
if args.virtual_pipeline_model_parallel_size is not None:
assert args.num_layers % args.virtual_pipeline_model_parallel_size == 0, \
'num_layers_per_stage must be divisible by ' \
'virtual_pipeline_model_parallel_size'
# Number of layers in each model chunk is the number of layers in the stage,
# divided by the number of model chunks in a stage.
self.num_layers = self.num_layers // args.virtual_pipeline_model_parallel_size
# With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
# layers to stages like (each list is a model chunk):
# Stage 0: [0] [2] [4] [6]
# Stage 1: [1] [3] [5] [7]
# With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
# layers to stages like (each list is a model chunk):
# Stage 0: [0, 1] [4, 5]
# Stage 1: [2, 3] [6, 7]
offset = mpu.get_virtual_pipeline_model_parallel_rank() * (
args.num_layers // args.virtual_pipeline_model_parallel_size) + \
(mpu.get_pipeline_model_parallel_rank() * self.num_layers)
else:
# Each stage gets a contiguous set of layers.
offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers
self.layers = torch.nn.ModuleList(
[build_layer(i + 1 + offset) for i in range(self.num_layers)])
if self.post_process:
# Final layer norm before output.
self.final_layernorm = LayerNorm(
args.hidden_size,
eps=args.layernorm_epsilon)
if deepspeed.checkpointing.is_configured():
global get_cuda_rng_tracker, checkpoint
get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
checkpoint = deepspeed.checkpointing.checkpoint
def _get_layer(self, layer_number):
return self.layers[layer_number]
def _checkpointed_forward(self, hidden_states, attention_mask,
encoder_output, enc_dec_attn_mask):
"""Forward method with activation checkpointing."""
def custom(start, end):
def custom_forward(*inputs):
x_ = inputs[0]
attention_mask = inputs[1]
encoder_output = inputs[2]
enc_dec_attn_mask = inputs[3]
for index in range(start, end):
layer = self._get_layer(index)
x_ = layer(x_, attention_mask, encoder_output, enc_dec_attn_mask)
return x_
return custom_forward
# Make sure memory is freed.
mpu.reset_checkpointed_activations_memory_buffer()
l = 0
while l < self.num_layers:
hidden_states = mpu.checkpoint(
custom(l, l + self.checkpoint_num_layers),
hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
l += self.checkpoint_num_layers
return hidden_states
def set_input_tensor(self, input_tensor):
"""Set input tensor to be used instead of forward()'s input.
When doing pipeline parallelism the input from the previous
stage comes from communication, not from the input, so the
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
self.input_tensor = input_tensor
def forward(self, hidden_states, attention_mask, layer_past=None,
get_key_value=False, encoder_output=None, enc_dec_attn_mask=None):
# Checks.
if layer_past is not None:
assert get_key_value, \
'for not None values in layer_past, ' \
'expected get_key_value to be set'
if get_key_value:
assert not self.checkpoint_activations, \
'get_key_value does not work with ' \
'activation checkpointing'
if self.pre_process:
            # Data format change to avoid explicit transposes: [b s h] --> [s b h].
            # If the input flag for fp32 residual connection is set, convert to float.
if self.fp32_residual_connection:
hidden_states = hidden_states.transpose(0, 1).contiguous().float()
# Otherwise, leave it as is.
else:
hidden_states = hidden_states.transpose(0, 1).contiguous()
else:
# See set_input_tensor()
hidden_states = self.input_tensor
if encoder_output is not None:
encoder_output = encoder_output.transpose(0, 1).contiguous()
if self.checkpoint_activations:
hidden_states = self._checkpointed_forward(hidden_states,
attention_mask,
encoder_output,
enc_dec_attn_mask)
else:
if get_key_value:
presents = []
for index in range(self.num_layers):
layer = self._get_layer(index)
past = None
if layer_past is not None:
past = layer_past[index]
hidden_states = layer(hidden_states,
attention_mask,
encoder_output=encoder_output,
enc_dec_attn_mask=enc_dec_attn_mask,
layer_past=past,
get_key_value=get_key_value)
if get_key_value:
hidden_states, present = hidden_states
presents.append(present)
# Final layer norm.
if self.post_process:
# Reverting data format change [s b h] --> [b s h].
hidden_states = hidden_states.transpose(0, 1).contiguous()
output = self.final_layernorm(hidden_states)
else:
output = hidden_states
if get_key_value:
output = [output, presents]
return output
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for models."""
import math
from functools import wraps
import torch
from megatron import get_args
def init_method_normal(sigma):
"""Init method based on N(0, sigma)."""
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
return init_
def scaled_init_method_normal(sigma, num_layers):
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
return init_
def attention_mask_func(attention_scores, attention_mask):
args = get_args()
if args.curriculum_learning:
attention_mask_ = attention_mask
actual_seqlen = attention_scores.size()[2]
if actual_seqlen != attention_mask_.size()[2]:
# attention_mask has size [1, 1, seqlen, seqlen]
attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous()
attention_scores.masked_fill_(attention_mask_, torch.finfo(attention_scores.dtype).min)
else:
attention_scores.masked_fill_(attention_mask, torch.finfo(attention_scores.dtype).min)
return attention_scores
def get_linear_layer(rows, columns, init_method):
"""Simple linear layer with weight initialization."""
layer = torch.nn.Linear(rows, columns)
init_method(layer.weight)
with torch.no_grad():
layer.bias.zero_()
return layer
@torch.jit.script
def gelu_impl(x):
"""OpenAI's gelu implementation."""
return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
(1.0 + 0.044715 * x * x)))
def openai_gelu(x):
return gelu_impl(x)
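# Sanity-check sketch (not executed here): the tanh approximation above should
# agree closely with PyTorch's own tanh-approximate GELU where that option is
# available (torch >= 1.12).
#
#   x = torch.linspace(-3, 3, steps=7)
#   torch.allclose(gelu_impl(x),
#                  torch.nn.functional.gelu(x, approximate='tanh'),
#                  atol=1e-6)                                  # expected: True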
# This is actually the Python equivalent of torch.nn.functional.gelu(), also with type hints for the ONNX exporter
@torch.jit.script
def erf_gelu(x):
return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
def log_debug_usage(logger, msg: str):
def log_debug_usage_(func):
"""Helper function in order to log a message when using a function for the first time"""
func.__logged_message__ = False
@wraps(func)
def wrapped(*args, **kwargs):
if func.__logged_message__ is False:
logger.debug(msg)
func.__logged_message__ = True
return func(*args, **kwargs)
return wrapped
return log_debug_usage_
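# Usage sketch for log_debug_usage (logger and function names below are
# hypothetical): the debug message is emitted only on the first call.
#
#   import logging
#   logger = logging.getLogger(__name__)
#
#   @log_debug_usage(logger, "using fused_softmax")
#   def fused_softmax(x):
#       return torch.nn.functional.softmax(x, dim=-1)
#
#   fused_softmax(torch.randn(2, 4))   # logs "using fused_softmax"
#   fused_softmax(torch.randn(2, 4))   # silent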
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Vision Transformer(VIT) model."""
import math
import einops
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import (
get_linear_layer,
init_method_normal,
scaled_init_method_normal,
)
from .module import MegatronModule
class VitMlpHead(MegatronModule):
"""Pooler layer.
Pool hidden states of a specific token (for example start of the
sequence) and add a linear transformation followed by a tanh.
Arguments:
hidden_size: hidden size
init_method: weight initialization method for the linear layer.
bias is set to zero.
"""
def __init__(self, hidden_size, num_classes):
super(VitMlpHead, self).__init__()
self.dense_in = torch.nn.Linear(hidden_size, hidden_size)
self.dense_out = torch.nn.Linear(hidden_size, num_classes)
torch.nn.init.constant_(self.dense_out.bias, -10)
def forward(self, hidden_states, sequence_index=0):
# hidden_states: [b, s, h]
# sequence_index: index of the token to pool.
x = hidden_states[:, sequence_index, :]
x = self.dense_in(x)
x = torch.tanh(x)
x = self.dense_out(x)
return x
def twod_interpolate_position_embeddings_hook(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
args = get_args()
num_patches_per_dim = args.img_dim // args.patch_dim
num_patches = num_patches_per_dim ** 2
seq_length = num_patches + 1
hidden_size = args.hidden_size
key = prefix + "weight"
# import pdb
# pdb.set_trace()
assert key in state_dict
if key in state_dict:
input_param = state_dict[key]
assert input_param.shape[1] == hidden_size
if input_param.shape[0] != seq_length:
# update input_param and load it to state_dict[key]
num_tok_input = input_param.shape[0] - 1
num_tok_new = seq_length - 1
input_param_tok, input_param_grid = (
input_param[:1, :],
input_param[1:, :],
)
gs_input = int(math.sqrt(num_tok_input))
gs_new = int(math.sqrt(num_tok_new))
input_param_grid = input_param_grid.transpose(0, 1).contiguous()
input_param_grid = input_param_grid.reshape(
(1, -1, gs_input, gs_input)
)
input_param_grid = input_param_grid.float()
scale_factor = gs_new / gs_input
input_param_grid = F.interpolate(
input_param_grid, scale_factor=scale_factor, mode="bilinear"
)
input_param_grid = input_param_grid.half()
input_param_grid = input_param_grid.reshape((-1, gs_new * gs_new))
input_param_grid = input_param_grid.transpose(0, 1).contiguous()
assert input_param_grid.shape[1] == hidden_size
input_param = torch.cat((input_param_tok, input_param_grid), dim=0)
assert (
input_param.shape[0] == seq_length
and input_param.shape[1] == hidden_size
)
state_dict[key] = input_param
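# Worked example for the hook above (hypothetical sizes): a checkpoint trained
# on a 14 x 14 patch grid stores a [197, h] position table (196 patches plus
# the class token). Loading it into a model with a 16 x 16 grid, the hook keeps
# row 0, reshapes rows 1..196 to [1, h, 14, 14], bilinearly interpolates with
# scale_factor 16 / 14 to [1, h, 16, 16], and writes back a [257, h] table.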
class VitModel(MegatronModule):
"""Vision Transformer Model."""
def __init__(self, num_classes, finetune=False):
super(VitModel, self).__init__()
args = get_args()
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
if args.init_method_xavier_uniform:
self.init_method = torch.nn.init.xavier_uniform_
self.scaled_init_method = torch.nn.init.xavier_uniform_
else:
self.init_method = init_method_normal(args.init_method_std)
self.scaled_init_method = scaled_init_method_normal(
args.init_method_std, args.num_layers
)
self.hidden_size = args.hidden_size
self.num_classes = num_classes
self.patch_dim = args.patch_dim
self.img_dim = args.img_dim
self.finetune = finetune
assert self.img_dim % self.patch_dim == 0
self.num_patches_per_dim = self.img_dim // self.patch_dim
self.num_patches = self.num_patches_per_dim ** 2
self.seq_length = self.num_patches + 1
self.flatten_dim = self.patch_dim * self.patch_dim * args.num_channels
# cls_token
self.cls_token = torch.nn.Parameter(torch.randn(1, 1, self.hidden_size))
torch.nn.init.zeros_(self.cls_token)
# Linear encoder
self.linear_encoder = torch.nn.Linear(
self.flatten_dim, self.hidden_size
)
# embedding
self.position_embeddings = torch.nn.Embedding(
self.seq_length, self.hidden_size
)
init_method_normal(args.init_method_std)(
self.position_embeddings.weight
)
self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda()
self.position_embeddings._register_load_state_dict_pre_hook(
twod_interpolate_position_embeddings_hook
)
self.embedding_dropout = torch.nn.Dropout(args.hidden_dropout)
# Transformer
self.transformer = ParallelTransformer(
self.init_method, self.scaled_init_method
)
# MLP head
if not self.finetune:
self.mlp_head = VitMlpHead(self.hidden_size, self.num_classes)
else:
self.class_head = get_linear_layer(
self.hidden_size, num_classes, torch.nn.init.zeros_
)
def forward(self, x):
x = einops.rearrange(
x,
"b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
p1=self.patch_dim,
p2=self.patch_dim,
)
assert x.dtype == torch.half
x = self.linear_encoder(x)
cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x = x + self.position_embeddings(self.position_ids)
x = self.embedding_dropout(x)
x = self.transformer(x, None)
if not self.finetune:
x = self.mlp_head(x)
else:
x = self.class_head(x[:, 0, :])
return x
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model parallel utility interface."""
from .cross_entropy import vocab_parallel_cross_entropy
from .data import broadcast_data
from .initialize import is_unitialized
from .initialize import destroy_model_parallel
from .initialize import get_data_parallel_group
from .initialize import get_data_parallel_rank
from .initialize import get_data_parallel_world_size
from .initialize import get_embedding_group
from .initialize import get_model_parallel_group
from .initialize import get_tensor_model_parallel_group
from .initialize import get_pipeline_model_parallel_group
from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank
from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank
from .initialize import is_pipeline_first_stage, is_pipeline_last_stage
from .initialize import get_tensor_model_parallel_src_rank
from .initialize import get_pipeline_model_parallel_first_rank
from .initialize import get_pipeline_model_parallel_last_rank
from .initialize import get_pipeline_model_parallel_next_rank
from .initialize import get_pipeline_model_parallel_prev_rank
from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size
from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size
from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank
from .initialize import initialize_model_parallel
from .initialize import model_parallel_is_initialized
from .initialize import get_model_parallel_world_size, get_model_parallel_rank
from .layers import ColumnParallelLinear
from .layers import RowParallelLinear
from .layers import VocabParallelEmbedding
from .layers import (set_tensor_model_parallel_attributes,
set_defaults_if_not_set_tensor_model_parallel_attributes,
copy_tensor_model_parallel_attributes)
from .mappings import copy_to_tensor_model_parallel_region
from .mappings import gather_from_tensor_model_parallel_region
from .mappings import reduce_from_tensor_model_parallel_region
from .mappings import scatter_to_tensor_model_parallel_region
from .random import checkpoint
from .random import get_cuda_rng_tracker
from .random import init_checkpointed_activations_memory_buffer
from .random import model_parallel_cuda_manual_seed
from .random import reset_checkpointed_activations_memory_buffer
from .random import gather_split_1d_tensor
from .random import split_tensor_into_1d_equal_chunks
from .utils import divide
from .utils import split_tensor_along_last_dim
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .initialize import get_tensor_model_parallel_group
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_world_size
from .utils import VocabUtility
class _VocabParallelCrossEntropy(torch.autograd.Function):
@staticmethod
def forward(ctx, vocab_parallel_logits, target):
# Maximum value along vocab dimension across all GPUs.
logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
torch.distributed.all_reduce(logits_max,
op=torch.distributed.ReduceOp.MAX,
group=get_tensor_model_parallel_group())
# Subtract the maximum value.
vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
        # Get the partition's vocab indices
get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
partition_vocab_size = vocab_parallel_logits.size()[-1]
rank = get_tensor_model_parallel_rank()
world_size = get_tensor_model_parallel_world_size()
vocab_start_index, vocab_end_index = get_vocab_range(
partition_vocab_size, rank, world_size)
# Create a mask of valid vocab ids (1 means it needs to be masked).
target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
masked_target = target.clone() - vocab_start_index
masked_target[target_mask] = 0
# Get predicted-logits = logits[target].
        # For simplicity, we convert logits to a 2-D tensor with size
# [*, partition-vocab-size] and target to a 1-D tensor of size [*].
logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
masked_target_1d = masked_target.view(-1)
arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
device=logits_2d.device)
predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
predicted_logits_1d = predicted_logits_1d.clone().contiguous()
predicted_logits = predicted_logits_1d.view_as(target)
predicted_logits[target_mask] = 0.0
# All reduce is needed to get the chunks from other GPUs.
torch.distributed.all_reduce(predicted_logits,
op=torch.distributed.ReduceOp.SUM,
group=get_tensor_model_parallel_group())
# Sum of exponential of logits along vocab dimension across all GPUs.
exp_logits = vocab_parallel_logits
torch.exp(vocab_parallel_logits, out=exp_logits)
sum_exp_logits = exp_logits.sum(dim=-1)
torch.distributed.all_reduce(sum_exp_logits,
op=torch.distributed.ReduceOp.SUM,
group=get_tensor_model_parallel_group())
# Loss = log(sum(exp(logits))) - predicted-logit.
loss = torch.log(sum_exp_logits) - predicted_logits
# Store softmax, target-mask and masked-target for backward pass.
exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
return loss
@staticmethod
def backward(ctx, grad_output):
        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target_1d = ctx.saved_tensors
        # All the inputs have softmax as their gradient.
grad_input = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
grad_2d = grad_input.view(-1, partition_vocab_size)
# Add the gradient from matching classes.
arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
device=grad_2d.device)
grad_2d[arange_1d, masked_target_1d] -= (
1.0 - target_mask.view(-1).float())
# Finally elementwise multiplication with the output gradients.
grad_input.mul_(grad_output.unsqueeze(dim=-1))
return grad_input, None
def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
"""Helper function for the cross entropy."""
return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
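# Minimal usage sketch (shapes are hypothetical): the logits are already split
# along the vocab dimension across the tensor-parallel group and the helper
# returns a per-token loss with the same shape as `target`.
#
#   # vocab_parallel_logits: [s, b, vocab_size // tp_world_size]
#   # target:                [s, b] global token ids
#   losses = vocab_parallel_cross_entropy(vocab_parallel_logits, target)
#   mean_loss = losses.float().mean()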
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .initialize import get_tensor_model_parallel_group
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_src_rank
_MAX_DATA_DIM = 5
def _check_data_types(keys, data, target_dtype):
"""Check that all the keys have the same target data type."""
for key in keys:
assert data[key].dtype == target_dtype, '{} has data type {} which '\
'is different than {}'.format(key, data[key].dtype, target_dtype)
def _build_key_size_numel_dictionaries(keys, data):
"""Build the size on rank 0 and broadcast."""
max_dim = _MAX_DATA_DIM
sizes = [0 for _ in range(max_dim) for _ in keys]
# Pack the sizes on rank zero.
if get_tensor_model_parallel_rank() == 0:
offset = 0
for key in keys:
assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
size = data[key].size()
for i, s in enumerate(size):
sizes[i + offset] = s
offset += max_dim
# Move to GPU and broadcast.
sizes_cuda = torch.cuda.LongTensor(sizes)
torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(),
group=get_tensor_model_parallel_group())
# Move back to cpu and unpack.
sizes_cpu = sizes_cuda.cpu()
key_size = {}
key_numel = {}
total_numel = 0
offset = 0
for key in keys:
i = 0
size = []
numel = 1
while sizes_cpu[offset + i] > 0:
this_size = sizes_cpu[offset + i]
size.append(this_size)
numel *= this_size
i += 1
key_size[key] = size
key_numel[key] = numel
total_numel += numel
offset += max_dim
return key_size, key_numel, total_numel
def broadcast_data(keys, data, datatype):
"""Broadcast data from rank zero of each model parallel group to the
members of the same model parallel group.
Arguments:
        keys: list of keys in the data dictionary to be broadcast
data: data dictionary of string keys and cpu tensor values.
datatype: torch data type of all tensors in data associated
with keys.
"""
# Build (key, size) and (key, number of elements) dictionaries along
# with the total number of elements on all ranks.
key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
data)
# Pack on rank zero.
if get_tensor_model_parallel_rank() == 0:
# Check that all keys have the same data type.
_check_data_types(keys, data, datatype)
# Flatten the data associated with the keys
flatten_data = torch.cat(
[data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
else:
flatten_data = torch.empty(total_numel,
device=torch.cuda.current_device(),
dtype=datatype)
# Broadcast
torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(),
group=get_tensor_model_parallel_group())
# Unpack
output = {}
offset = 0
for key in keys:
size = key_size[key]
numel = key_numel[key]
output[key] = flatten_data.narrow(0, offset, numel).view(size)
offset += numel
return output
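# Usage sketch (keys and tensors are hypothetical): every rank in the
# tensor-parallel group calls broadcast_data with the same keys and datatype;
# only rank 0 needs real data, the other ranks receive the broadcast copy
# (already placed on the current GPU).
#
#   keys = ['text', 'labels']
#   if get_tensor_model_parallel_rank() == 0:
#       data = {'text': tokens_cpu, 'labels': labels_cpu}   # CPU int64 tensors
#   else:
#       data = None
#   batch = broadcast_data(keys, data, torch.int64)
#   tokens, labels = batch['text'], batch['labels']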
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model and data parallel groups."""
import torch
from .utils import ensure_divisibility
# Intra-layer model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Inter-layer model parallel group that the current rank belongs to.
_PIPELINE_MODEL_PARALLEL_GROUP = None
# Model parallel group (both intra- and pipeline) that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Embedding group.
_EMBEDDING_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
# These values enable us to change the mpu sizes on the fly.
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_TENSOR_MODEL_PARALLEL_RANK = None
_MPU_PIPELINE_MODEL_PARALLEL_RANK = None
# A list of global ranks for each pipeline group to ease calculation of the source
# rank when broadcasting from the first or last pipeline stage
_PIPELINE_GLOBAL_RANKS = None
def is_unitialized():
"""Useful for code segments that may be accessed with or without mpu initialization"""
return _DATA_PARALLEL_GROUP is None
def initialize_model_parallel(tensor_model_parallel_size_=1,
pipeline_model_parallel_size_=1,
virtual_pipeline_model_parallel_size_=None):
"""
Initialize model data parallel groups.
Arguments:
tensor_model_parallel_size: number of GPUs used to parallelize model tensor.
pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline.
Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
the model pipeline. The present function will
create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
and 8 data-parallel groups as:
8 data_parallel groups:
[g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
8 tensor model-parallel groups:
[g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
4 pipeline model-parallel groups:
[g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
Note that for efficiency, the caller should make sure adjacent ranks
are on the same DGX box. For example if we are using 2 DGX-1 boxes
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
ranks 8 to 15 belong to the second box.
"""
if torch.distributed.get_rank() == 0:
print('> initializing tensor model parallel with size {}'.format(
tensor_model_parallel_size_))
print('> initializing pipeline model parallel with size {}'.format(
pipeline_model_parallel_size_))
# Get world size and rank. Ensure some consistencies.
assert torch.distributed.is_initialized()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size)
pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size)
ensure_divisibility(world_size,
tensor_model_parallel_size * pipeline_model_parallel_size)
data_parallel_size = world_size // (tensor_model_parallel_size *
pipeline_model_parallel_size)
num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
num_data_parallel_groups = world_size // data_parallel_size
if virtual_pipeline_model_parallel_size_ is not None:
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size_
rank = torch.distributed.get_rank()
# Build the data-parallel groups.
global _DATA_PARALLEL_GROUP
assert _DATA_PARALLEL_GROUP is None, \
'data parallel group is already initialized'
all_data_parallel_group_ranks = []
for i in range(pipeline_model_parallel_size):
start_rank = i * num_pipeline_model_parallel_groups
end_rank = (i + 1) * num_pipeline_model_parallel_groups
for j in range(tensor_model_parallel_size):
ranks = range(start_rank + j, end_rank,
tensor_model_parallel_size)
all_data_parallel_group_ranks.append(list(ranks))
group = torch.distributed.new_group(ranks)
if rank in ranks:
_DATA_PARALLEL_GROUP = group
# Build the model-parallel groups.
global _MODEL_PARALLEL_GROUP
assert _MODEL_PARALLEL_GROUP is None, \
'model parallel group is already initialized'
for i in range(data_parallel_size):
ranks = [data_parallel_group_ranks[i]
for data_parallel_group_ranks in all_data_parallel_group_ranks]
group = torch.distributed.new_group(ranks)
if rank in ranks:
_MODEL_PARALLEL_GROUP = group
# Build the tensor model-parallel groups.
global _TENSOR_MODEL_PARALLEL_GROUP
assert _TENSOR_MODEL_PARALLEL_GROUP is None, \
'tensor model parallel group is already initialized'
for i in range(num_tensor_model_parallel_groups):
ranks = range(i * tensor_model_parallel_size,
(i + 1) * tensor_model_parallel_size)
group = torch.distributed.new_group(ranks)
if rank in ranks:
_TENSOR_MODEL_PARALLEL_GROUP = group
# Build the pipeline model-parallel groups and embedding groups
# (first and last rank in each pipeline model-parallel group).
global _PIPELINE_MODEL_PARALLEL_GROUP
global _PIPELINE_GLOBAL_RANKS
assert _PIPELINE_MODEL_PARALLEL_GROUP is None, \
'pipeline model parallel group is already initialized'
global _EMBEDDING_GROUP
assert _EMBEDDING_GROUP is None, \
'embedding group is already initialized'
for i in range(num_pipeline_model_parallel_groups):
ranks = range(i, world_size,
num_pipeline_model_parallel_groups)
group = torch.distributed.new_group(ranks)
if rank in ranks:
_PIPELINE_MODEL_PARALLEL_GROUP = group
_PIPELINE_GLOBAL_RANKS = ranks
# Setup embedding group (to exchange gradients between
# first and last stages).
if len(ranks) > 1:
embedding_ranks = [ranks[0], ranks[-1]]
else:
embedding_ranks = ranks
group = torch.distributed.new_group(embedding_ranks)
if rank in embedding_ranks:
_EMBEDDING_GROUP = group
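# Worked example matching the docstring above: 16 GPUs with
# tensor_model_parallel_size=2 and pipeline_model_parallel_size=4 give a
# data-parallel size of 16 // (2 * 4) = 2. A standalone sketch of the rank
# lists (for illustration only):
#
#   world_size, tp, pp = 16, 2, 4
#   tensor_groups = [list(range(i * tp, (i + 1) * tp))
#                    for i in range(world_size // tp)]    # [0,1], [2,3], ...
#   pipeline_groups = [list(range(i, world_size, world_size // pp))
#                      for i in range(world_size // pp)]  # [0,4,8,12], ...
#   # embedding group for pipeline_groups[0] is [0, 12] (first and last stage)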
def model_parallel_is_initialized():
"""Check if model and data parallel groups are initialized."""
if _TENSOR_MODEL_PARALLEL_GROUP is None or \
_PIPELINE_MODEL_PARALLEL_GROUP is None or \
_DATA_PARALLEL_GROUP is None:
return False
return True
def get_model_parallel_group():
"""Get the model parallel group the caller rank belongs to."""
assert _MODEL_PARALLEL_GROUP is not None, \
'model parallel group is not initialized'
return _MODEL_PARALLEL_GROUP
def get_tensor_model_parallel_group():
"""Get the tensor model parallel group the caller rank belongs to."""
assert _TENSOR_MODEL_PARALLEL_GROUP is not None, \
'intra_layer_model parallel group is not initialized'
return _TENSOR_MODEL_PARALLEL_GROUP
def get_pipeline_model_parallel_group():
"""Get the pipeline model parallel group the caller rank belongs to."""
assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, \
'pipeline_model parallel group is not initialized'
return _PIPELINE_MODEL_PARALLEL_GROUP
def get_data_parallel_group():
"""Get the data parallel group the caller rank belongs to."""
assert _DATA_PARALLEL_GROUP is not None, \
'data parallel group is not initialized'
return _DATA_PARALLEL_GROUP
def get_embedding_group():
"""Get the embedding group the caller rank belongs to."""
assert _EMBEDDING_GROUP is not None, \
'embedding group is not initialized'
return _EMBEDDING_GROUP
def set_tensor_model_parallel_world_size(world_size):
"""Set the tensor model parallel size"""
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size
def set_pipeline_model_parallel_world_size(world_size):
"""Set the pipeline model parallel size"""
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
def get_tensor_model_parallel_world_size():
"""Return world size for the tensor model parallel group."""
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
def get_model_parallel_world_size():
assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_world_size is only supported if PP is disabled"
return get_tensor_model_parallel_world_size()
def get_pipeline_model_parallel_world_size():
"""Return world size for the pipeline model parallel group."""
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())
def set_tensor_model_parallel_rank(rank):
"""Set tensor model parallel rank."""
global _MPU_TENSOR_MODEL_PARALLEL_RANK
_MPU_TENSOR_MODEL_PARALLEL_RANK = rank
def set_pipeline_model_parallel_rank(rank):
"""Set pipeline model parallel rank."""
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
_MPU_PIPELINE_MODEL_PARALLEL_RANK = rank
def get_tensor_model_parallel_rank():
"""Return my rank for the tensor model parallel group."""
global _MPU_TENSOR_MODEL_PARALLEL_RANK
if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None:
return _MPU_TENSOR_MODEL_PARALLEL_RANK
return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
def get_model_parallel_rank():
assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_rank is only supported if PP is disabled"
return get_tensor_model_parallel_rank()
def get_pipeline_model_parallel_rank():
"""Return my rank for the pipeline model parallel group."""
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_RANK
return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())
def is_pipeline_first_stage(ignore_virtual=False):
"""Return True if in the first pipeline model-parallel stage, False otherwise."""
if not ignore_virtual:
if get_virtual_pipeline_model_parallel_world_size() is not None and \
get_virtual_pipeline_model_parallel_rank() != 0:
return False
return get_pipeline_model_parallel_rank() == 0
def is_pipeline_last_stage(ignore_virtual=False):
"""Return True if in the last pipeline model-parallel stage, False otherwise."""
if not ignore_virtual:
virtual_pipeline_model_parallel_world_size = \
get_virtual_pipeline_model_parallel_world_size()
if virtual_pipeline_model_parallel_world_size is not None and \
get_virtual_pipeline_model_parallel_rank() != (
virtual_pipeline_model_parallel_world_size - 1):
return False
return get_pipeline_model_parallel_rank() == (
get_pipeline_model_parallel_world_size() - 1)
def get_virtual_pipeline_model_parallel_rank():
"""Return the virtual pipeline-parallel rank."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
def set_virtual_pipeline_model_parallel_rank(rank):
"""Set the virtual pipeline-parallel rank."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank
def get_virtual_pipeline_model_parallel_world_size():
"""Return the virtual pipeline-parallel world size."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
def get_tensor_model_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the tensor model parallel group."""
global_rank = torch.distributed.get_rank()
local_world_size = get_tensor_model_parallel_world_size()
return (global_rank // local_world_size) * local_world_size
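# Worked example: with a tensor model parallel world size of 4, global ranks
# 4, 5, 6 and 7 form one group, and each maps to source rank (rank // 4) * 4 == 4.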
def get_pipeline_model_parallel_first_rank():
assert _PIPELINE_GLOBAL_RANKS is not None, \
"Pipeline parallel group is not initialized"
return _PIPELINE_GLOBAL_RANKS[0]
def get_pipeline_model_parallel_last_rank():
assert _PIPELINE_GLOBAL_RANKS is not None, \
"Pipeline parallel group is not initialized"
last_rank_local = get_pipeline_model_parallel_world_size() - 1
return _PIPELINE_GLOBAL_RANKS[last_rank_local]
def get_pipeline_model_parallel_next_rank():
assert _PIPELINE_GLOBAL_RANKS is not None, \
"Pipeline parallel group is not initialized"
rank_in_pipeline = get_pipeline_model_parallel_rank()
world_size = get_pipeline_model_parallel_world_size()
return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
def get_pipeline_model_parallel_prev_rank():
assert _PIPELINE_GLOBAL_RANKS is not None, \
"Pipeline parallel group is not initialized"
rank_in_pipeline = get_pipeline_model_parallel_rank()
world_size = get_pipeline_model_parallel_world_size()
return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
def get_data_parallel_world_size():
"""Return world size for the data parallel group."""
return torch.distributed.get_world_size(group=get_data_parallel_group())
def get_data_parallel_rank():
"""Return my rank for the data parallel group."""
return torch.distributed.get_rank(group=get_data_parallel_group())
def destroy_model_parallel():
"""Set the groups to none."""
global _TENSOR_MODEL_PARALLEL_GROUP
_TENSOR_MODEL_PARALLEL_GROUP = None
global _PIPELINE_MODEL_PARALLEL_GROUP
_PIPELINE_MODEL_PARALLEL_GROUP = None
global _DATA_PARALLEL_GROUP
_DATA_PARALLEL_GROUP = None
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import math
import torch
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.parameter import Parameter
from functools import partial
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_world_size
from .mappings import copy_to_tensor_model_parallel_region
from .mappings import gather_from_tensor_model_parallel_region
from .mappings import reduce_from_tensor_model_parallel_region
from .mappings import scatter_to_tensor_model_parallel_region
from .random import get_cuda_rng_tracker
from .utils import divide
from .utils import split_tensor_along_last_dim
from .utils import VocabUtility
from ..model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from megatron import get_args, mpu
import deepspeed.runtime.activation_checkpointing.checkpointing as ds_checkpointing
_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
'partition_dim': -1,
'partition_stride': 1}
def param_is_not_tensor_parallel_duplicate(param):
return (hasattr(param, 'tensor_model_parallel') and
param.tensor_model_parallel) or (
get_tensor_model_parallel_rank() == 0)
def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
# Make sure the attributes are not set.
for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
assert not hasattr(tensor, attribute)
# Set the attributes.
setattr(tensor, 'tensor_model_parallel', is_parallel)
setattr(tensor, 'partition_dim', dim)
setattr(tensor, 'partition_stride', stride)
def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
def maybe_set(attribute, value):
if not hasattr(tensor, attribute):
setattr(tensor, attribute, value)
for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])
def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
def maybe_copy(attribute):
if hasattr(source_tensor, attribute):
setattr(destination_tensor, attribute,
getattr(source_tensor, attribute))
for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
maybe_copy(attribute)
def _initialize_affine_weight_gpu(weight, init_method,
partition_dim, stride=1):
"""Initialize affine weight for model parallel on GPU."""
set_tensor_model_parallel_attributes(tensor=weight,
is_parallel=True,
dim=partition_dim,
stride=stride)
if ds_checkpointing.is_configured():
global get_cuda_rng_tracker
get_cuda_rng_tracker = ds_checkpointing.get_cuda_rng_tracker
with get_cuda_rng_tracker().fork():
init_method(weight)
def _initialize_affine_weight_cpu(weight, output_size, input_size,
per_partition_size, partition_dim,
init_method, stride=1,
return_master_weight=False):
"""Initialize affine weight for model parallel.
Build the master weight on all processes and scatter
the relevant chunk."""
set_tensor_model_parallel_attributes(tensor=weight,
is_parallel=True,
dim=partition_dim,
stride=stride)
# Initialize master weight
master_weight = torch.empty(output_size, input_size,
dtype=torch.float,
requires_grad=False)
init_method(master_weight)
args = get_args()
master_weight = master_weight.to(dtype=args.params_dtype)
# Split and copy
per_partition_per_stride_size = divide(per_partition_size, stride)
weight_list = torch.split(master_weight, per_partition_per_stride_size,
dim=partition_dim)
rank = get_tensor_model_parallel_rank()
world_size = get_tensor_model_parallel_world_size()
my_weight_list = weight_list[rank::world_size]
with torch.no_grad():
torch.cat(my_weight_list, dim=partition_dim, out=weight)
if return_master_weight:
return master_weight
return None
def xavier_uniform_tensor_parallel_(tensor, gain=1., tp_degree=1):
r"""
    This is a modified torch.nn.init.xavier_uniform_ with changes to support
    embeddings partitioned along the vocab-size dimension under tensor parallelism.
Additional args:
- tp_degree: degree of tensor parallel
Note: the code assumes all partitions are equal in size
"""
# receptive_field_size=1 as dim==2, so we don't need init._calculate_fan_in_and_fan_out
fan_out, fan_in = tensor.shape
fan_out *= tp_degree # tp splits on num_embeddings dim
std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
a = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation
return torch.nn.init._no_grad_uniform_(tensor, -a, a)
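# Sketch of the intent (hypothetical sizes): for an embedding shard of shape
# [vocab // tp, hidden], the Xavier bound is computed as if the weight were the
# full, unpartitioned [vocab, hidden] matrix.
#
#   shard = torch.empty(50304 // 4, 1024)
#   xavier_uniform_tensor_parallel_(shard, tp_degree=4)
#   # uniform bound: sqrt(3) * sqrt(2 / (1024 + 50304)) instead of
#   # sqrt(3) * sqrt(2 / (1024 + 12576))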
class VocabParallelEmbedding(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
This is mainly adapted from torch.nn.Embedding and all the default
values are kept.
Arguments:
num_embeddings: vocabulary size.
embedding_dim: size of hidden state.
init_method: method to initialize weights.
"""
def __init__(self, num_embeddings, embedding_dim,
init_method=init.xavier_normal_):
super(VocabParallelEmbedding, self).__init__()
# Keep the input dimensions.
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
# Set the defaults for compatibility.
self.padding_idx = None
self.max_norm = None
self.norm_type = 2.
self.scale_grad_by_freq = False
self.sparse = False
self._weight = None
self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
# Divide the weight matrix along the vocabulary dimension.
self.vocab_start_index, self.vocab_end_index = \
VocabUtility.vocab_range_from_global_vocab_size(
self.num_embeddings, get_tensor_model_parallel_rank(),
self.tensor_model_parallel_size)
self.num_embeddings_per_partition = self.vocab_end_index - \
self.vocab_start_index
# Allocate weights and initialize.
args = get_args()
# only the first stage embedding runs this class' forward. The head's embedding does its own
# thing, so don't waste memory allocating LN weights.
if mpu.is_pipeline_first_stage() and (args.use_bnb_optimizer or args.embed_layernorm):
self.norm = LayerNorm(embedding_dim)
if args.use_bnb_optimizer:
# for BNB we ignore the passed init_method and use torch.nn.init.xavier_uniform_
# modified to calculate std on the unpartitioned embedding
init_method = partial(xavier_uniform_tensor_parallel_, tp_degree=self.tensor_model_parallel_size)
if args.use_cpu_initialization:
self.weight = Parameter(torch.empty(
self.num_embeddings_per_partition, self.embedding_dim,
dtype=args.params_dtype))
_initialize_affine_weight_cpu(
self.weight, self.num_embeddings, self.embedding_dim,
self.num_embeddings_per_partition, 0, init_method)
else:
self.weight = Parameter(torch.empty(
self.num_embeddings_per_partition, self.embedding_dim,
device=torch.cuda.current_device(), dtype=args.params_dtype))
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=0, stride=1)
if args.use_bnb_optimizer:
from bitsandbytes.optim import GlobalOptimManager
GlobalOptimManager.get_instance().override_config(self.weight, 'optim_bits', 32)
GlobalOptimManager.get_instance().register_parameters(self.weight)
def forward(self, input_):
if torch.any(input_ >= self.num_embeddings):
raise ValueError(f"There is an input id in the input that is greater than the highest possible input id.\nInput: {input_}\nnum_embeddings: {self.num_embeddings}")
if self.tensor_model_parallel_size > 1:
# Build the mask.
input_mask = (input_ < self.vocab_start_index) | \
(input_ >= self.vocab_end_index)
# Mask the input.
masked_input = input_.clone() - self.vocab_start_index
masked_input[input_mask] = 0
else:
            # input_ is guaranteed to be in the range [0:self.vocab_end_index - self.vocab_start_index] thanks to the first check
masked_input = input_
# Get the embeddings.
output_parallel = F.embedding(masked_input, self.weight,
self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq,
self.sparse)
# Mask the output embedding.
if self.tensor_model_parallel_size > 1:
output_parallel[input_mask, :] = 0.0
# Reduce across all the model parallel GPUs.
output = reduce_from_tensor_model_parallel_region(output_parallel)
if hasattr(self, 'norm'):
output = self.norm(output)
return output
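# Usage sketch (sizes are hypothetical): each tensor-parallel rank holds
# vocab_size / tp_world_size rows of the table; out-of-range ids are masked
# locally and the all-reduce sums the partial lookups into the full output.
#
#   emb = VocabParallelEmbedding(50304, 1024, init_method=init.xavier_normal_)
#   tokens = torch.randint(0, 50304, (4, 2048), device='cuda')
#   hidden = emb(tokens)     # [4, 2048, 1024], identical on every TP rank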
class ColumnParallelLinear(torch.nn.Module):
"""Linear layer with column parallelism.
The linear layer is defined as Y = XA + b. A is parallelized along
its second dimension as A = [A_1, ..., A_p].
Arguments:
input_size: first dimension of matrix A.
output_size: second dimension of matrix A.
bias: If true, add bias
        gather_output: If true, call all-gather on the output and make Y
                       available to all GPUs; otherwise, every GPU will have
                       its own output, which is Y_i = XA_i.
init_method: method to initialize weights. Note that bias is always set
to zero.
stride: For the strided linear layers.
keep_master_weight_for_test: This was added for testing and should be
set to False. It returns the master weights
used for initialization.
        skip_bias_add: This was added to enable performance optimizations where
                       the bias can be fused with other elementwise operations.
                       We skip adding the bias here and return it instead.
"""
def __init__(self, input_size, output_size, bias=True, gather_output=True,
init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False):
super(ColumnParallelLinear, self).__init__()
# Keep input parameters
self.input_size = input_size
self.output_size = output_size
self.gather_output = gather_output
# Divide the weight matrix along the last dimension.
world_size = get_tensor_model_parallel_world_size()
self.output_size_per_partition = divide(output_size, world_size)
self.skip_bias_add = skip_bias_add
# Parameters.
# Note: torch.nn.functional.linear performs XA^T + b and as a result
# we allocate the transpose.
# Initialize weight.
args = get_args()
if args.use_cpu_initialization:
self.weight = Parameter(torch.empty(self.output_size_per_partition,
self.input_size,
dtype=args.params_dtype))
self.master_weight = _initialize_affine_weight_cpu(
self.weight, self.output_size, self.input_size,
self.output_size_per_partition, 0, init_method,
stride=stride, return_master_weight=keep_master_weight_for_test)
else:
self.weight = Parameter(torch.empty(
self.output_size_per_partition, self.input_size,
device=torch.cuda.current_device(), dtype=args.params_dtype))
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=0, stride=stride)
if bias:
if args.use_cpu_initialization:
self.bias = Parameter(torch.empty(
self.output_size_per_partition, dtype=args.params_dtype))
else:
self.bias = Parameter(torch.empty(
self.output_size_per_partition,
device=torch.cuda.current_device(),
dtype=args.params_dtype))
set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
# Always initialize bias to zero.
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)
def forward(self, input_):
# Set up backprop all-reduce.
input_parallel = copy_to_tensor_model_parallel_region(input_)
# Matrix multiply.
bias = self.bias if not self.skip_bias_add else None
output_parallel = F.linear(input_parallel, self.weight, bias)
if self.gather_output:
# All-gather across the partitions.
output = gather_from_tensor_model_parallel_region(output_parallel)
else:
output = output_parallel
output_bias = self.bias if self.skip_bias_add else None
return output, output_bias
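# Shape sketch (hypothetical sizes, tp_world_size = 4): each rank owns a
# [output_size // 4, input_size] slice of the weight.
#
#   fc = ColumnParallelLinear(1024, 4096, gather_output=False, skip_bias_add=True)
#   y_local, bias = fc(x)    # x: [s, b, 1024] -> y_local: [s, b, 1024]
#                            # (4096 // 4 columns per rank); the bias is
#                            # returned separately for a later fused add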
class RowParallelLinear(torch.nn.Module):
"""Linear layer with row parallelism.
The linear layer is defined as Y = XA + b. A is parallelized along
its first dimension and X along its second dimension as:
- -
| A_1 |
| . |
A = | . | X = [X_1, ..., X_p]
| . |
| A_p |
- -
Arguments:
input_size: first dimension of matrix A.
output_size: second dimension of matrix A.
bias: If true, add bias. Note that bias is not parallelized.
input_is_parallel: If true, we assume that the input is already
split across the GPUs and we do not split
again.
init_method: method to initialize weights. Note that bias is always set
to zero.
stride: For the strided linear layers.
keep_master_weight_for_test: This was added for testing and should be
set to False. It returns the master weights
used for initialization.
        skip_bias_add: This was added to enable performance optimizations where
                       the bias can be fused with other elementwise operations.
                       We skip adding the bias here and return it instead.
"""
def __init__(self, input_size, output_size, bias=True,
input_is_parallel=False,
init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False):
super(RowParallelLinear, self).__init__()
# Keep input parameters
self.input_size = input_size
self.output_size = output_size
self.input_is_parallel = input_is_parallel
# Divide the weight matrix along the last dimension.
world_size = get_tensor_model_parallel_world_size()
self.input_size_per_partition = divide(input_size, world_size)
self.skip_bias_add = skip_bias_add
# Parameters.
# Note: torch.nn.functional.linear performs XA^T + b and as a result
# we allocate the transpose.
# Initialize weight.
args = get_args()
if args.use_cpu_initialization:
self.weight = Parameter(torch.empty(self.output_size,
self.input_size_per_partition,
dtype=args.params_dtype))
self.master_weight = _initialize_affine_weight_cpu(
self.weight, self.output_size, self.input_size,
self.input_size_per_partition, 1, init_method,
stride=stride, return_master_weight=keep_master_weight_for_test)
else:
self.weight = Parameter(torch.empty(
self.output_size, self.input_size_per_partition,
device=torch.cuda.current_device(), dtype=args.params_dtype))
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=1, stride=stride)
if bias:
if args.use_cpu_initialization:
self.bias = Parameter(torch.empty(self.output_size,
dtype=args.params_dtype))
else:
self.bias = Parameter(torch.empty(
self.output_size, device=torch.cuda.current_device(),
dtype=args.params_dtype))
# Always initialize bias to zero.
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)
self.bias_tp_auto_sync = args.sync_tp_duplicated_parameters
def forward(self, input_):
# Set up backprop all-reduce.
if self.input_is_parallel:
input_parallel = input_
else:
input_parallel = scatter_to_tensor_model_parallel_region(input_)
# Matrix multiply.
output_parallel = F.linear(input_parallel, self.weight)
# All-reduce across all the partitions.
output_ = reduce_from_tensor_model_parallel_region(output_parallel)
if self.bias_tp_auto_sync:
torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
if not self.skip_bias_add:
output = output_ + self.bias if self.bias is not None else output_
output_bias = None
else:
output = output_
output_bias = self.bias
return output, output_bias
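# Shape sketch (hypothetical sizes, tp_world_size = 4), the natural pair of the
# column-parallel layer above: the input arrives already split along its last
# dimension, each rank multiplies by its [output_size, input_size // 4] shard,
# and the all-reduce restores the full activation.
#
#   proj = RowParallelLinear(4096, 1024, input_is_parallel=True, skip_bias_add=True)
#   y, bias = proj(h_local)  # h_local: [s, b, 1024] -> y: [s, b, 1024]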
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .initialize import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank
from .utils import split_tensor_along_last_dim
def _reduce(input_):
"""All-reduce the the input tensor across model parallel group."""
# Bypass the function if we are using only 1 GPU.
if get_tensor_model_parallel_world_size()==1:
return input_
# All-reduce.
torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
return input_
def _split(input_):
"""Split the tensor along its last dimension and keep the
corresponding slice."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size==1:
return input_
# Split along last dimension.
input_list = split_tensor_along_last_dim(input_, world_size)
# Note: torch.split does not create contiguous tensors by default.
rank = get_tensor_model_parallel_rank()
output = input_list[rank].contiguous()
return output
def _gather(input_):
"""Gather tensors and concatinate along the last dimension."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size==1:
return input_
# Size and dimension.
last_dim = input_.dim() - 1
rank = get_tensor_model_parallel_rank()
tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
tensor_list[rank] = input_
torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())
# Note: torch.cat already creates a contiguous tensor.
output = torch.cat(tensor_list, dim=last_dim).contiguous()
return output
class _CopyToModelParallelRegion(torch.autograd.Function):
"""Pass the input to the model parallel region."""
@staticmethod
def symbolic(graph, input_):
return input_
@staticmethod
def forward(ctx, input_):
return input_
@staticmethod
def backward(ctx, grad_output):
return _reduce(grad_output)
class _ReduceFromModelParallelRegion(torch.autograd.Function):
"""All-reduce the input from the model parallel region."""
@staticmethod
def symbolic(graph, input_):
return _reduce(input_)
@staticmethod
def forward(ctx, input_):
return _reduce(input_)
@staticmethod
def backward(ctx, grad_output):
return grad_output
class _ScatterToModelParallelRegion(torch.autograd.Function):
"""Split the input and keep only the corresponding chuck to the rank."""
@staticmethod
def symbolic(graph, input_):
return _split(input_)
@staticmethod
def forward(ctx, input_):
return _split(input_)
@staticmethod
def backward(ctx, grad_output):
return _gather(grad_output)
class _GatherFromModelParallelRegion(torch.autograd.Function):
"""Gather the input from model parallel region and concatinate."""
@staticmethod
def symbolic(graph, input_):
return _gather(input_)
@staticmethod
def forward(ctx, input_):
return _gather(input_)
@staticmethod
def backward(ctx, grad_output):
return _split(grad_output)
# -----------------
# Helper functions.
# -----------------
def copy_to_tensor_model_parallel_region(input_):
return _CopyToModelParallelRegion.apply(input_)
def reduce_from_tensor_model_parallel_region(input_):
return _ReduceFromModelParallelRegion.apply(input_)
def scatter_to_tensor_model_parallel_region(input_):
return _ScatterToModelParallelRegion.apply(input_)
def gather_from_tensor_model_parallel_region(input_):
return _GatherFromModelParallelRegion.apply(input_)
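
# Local sketch of the split/gather round trip used by the mappings above,
# emulated without torch.distributed for an assumed world size of 2: splitting
# a tensor along its last dimension and concatenating the slices in rank order
# recovers the original tensor, which is the invariant _split and _gather
# depend on.
def _split_gather_roundtrip_sketch(world_size=2):
    x = torch.randn(4, 6)
    slices = split_tensor_along_last_dim(x, world_size)
    rebuilt = torch.cat([s.contiguous() for s in slices], dim=-1)
    assert torch.equal(rebuilt, x)
    return rebuilt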
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import contextlib
import torch
from torch import _C
from torch.cuda import _lazy_call, device as device_ctx_manager
from torch.utils.checkpoint import detach_variable
from megatron import get_args
from megatron.memory import allocate_mem_buff
from .initialize import get_data_parallel_rank
from .initialize import get_tensor_model_parallel_group
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_world_size
# Default name for the model parallel rng tracker.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
# Whether to apply model parallelism to checkpointed hidden states.
_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None
def init_checkpointed_activations_memory_buffer():
"""Initialize the memory buffer for the checkpointed activations."""
args = get_args()
upper_bound_sequence_length = max(
args.seq_length if args.seq_length is not None else 0,
args.decoder_seq_length if args.decoder_seq_length is not None else 0
)
per_layer = args.micro_batch_size * upper_bound_sequence_length * \
args.hidden_size // args.tensor_model_parallel_size
assert args.num_layers % args.checkpoint_num_layers == 0, \
'number of layers is not divisible by checkpoint-num-layers'
num_checkpointer_layers = args.num_layers // args.checkpoint_num_layers
numel = per_layer * num_checkpointer_layers
dtype = torch.half
if not args.fp16:
dtype = torch.float
global _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER
assert _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is None, \
'checkpointed activations memory buffer is already allocated.'
_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = allocate_mem_buff(
'checkpointed activations', numel, dtype, track_usage=False)
def reset_checkpointed_activations_memory_buffer():
"""Reset the memory used for checkpointing."""
if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.reset()
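
# Worked example of the buffer sizing above under assumed settings
# (micro_batch_size=4, seq_length=1024, hidden_size=2048,
# tensor_model_parallel_size=2, num_layers=24, checkpoint_num_layers=1);
# these values are illustrative, not defaults.
def _checkpoint_buffer_numel_sketch():
    micro_batch_size, seq_length, hidden_size = 4, 1024, 2048
    tensor_model_parallel_size = 2
    num_layers, checkpoint_num_layers = 24, 1
    per_layer = micro_batch_size * seq_length * hidden_size \
        // tensor_model_parallel_size
    numel = per_layer * (num_layers // checkpoint_num_layers)
    # per_layer == 4_194_304 elements and numel == 100_663_296 elements,
    # i.e. exactly 192 MiB in fp16.
    return numel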
def _set_cuda_rng_state(new_state, device=-1):
"""Sets the random number generator state of the current GPU.
Arguments:
new_state (torch.ByteTensor): The desired state
This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
with a single change: the input state is not cloned. Cloning caused
major performance issues for 4+ GPU cases.
"""
if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
# older PyTorch
def cb():
with device_ctx_manager(device):
_C._cuda_setRNGState(new_state)
else:
# newer PyTorch
if device == -1:
device = torch.device('cuda')
elif isinstance(device, str):
device = torch.device(device)
elif isinstance(device, int):
device = torch.device('cuda', device)
def cb():
idx = device.index
if idx is None:
idx = torch.cuda.current_device()
default_generator = torch.cuda.default_generators[idx]
default_generator.set_state(new_state)
_lazy_call(cb)
def split_tensor_into_1d_equal_chunks(tensor):
"""Break a tensor into equal 1D chunks."""
data = tensor.view(-1)
partition_size = torch.numel(data) // get_tensor_model_parallel_world_size()
start_index = partition_size * get_tensor_model_parallel_rank()
end_index = start_index + partition_size
return data[start_index:end_index]
def gather_split_1d_tensor(tensor):
"""Opposite of above function, gather values from model parallel ranks."""
world_size = get_tensor_model_parallel_world_size()
numel = torch.numel(tensor)
numel_gathered = world_size * numel
gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
device=torch.cuda.current_device(),
requires_grad=False)
chunks = [gathered[i*numel:(i+1)*numel] for i in range(world_size)]
torch.distributed.all_gather(chunks, tensor,
group=get_tensor_model_parallel_group())
return gathered
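
# Local sketch of the contract between the two functions above, assuming a
# world size of 4 and no torch.distributed: each rank keeps one equal 1D
# chunk of the flattened tensor, and concatenating the chunks in rank order
# reproduces it.
def _chunk_roundtrip_sketch(world_size=4):
    tensor = torch.arange(2 * 6, dtype=torch.float).view(2, 6)
    data = tensor.view(-1)
    partition_size = data.numel() // world_size
    chunks = [data[rank * partition_size:(rank + 1) * partition_size]
              for rank in range(world_size)]
    assert torch.equal(torch.cat(chunks), data)
    return chunks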
class CudaRNGStatesTracker:
"""Tracker for the cuda RNG states.
Using the `add` method, a cuda rng state is initialized based on
the input `seed` and is assigned to `name`. Later, by forking the
rng state, we can perform operations and return to our starting
cuda state.
"""
def __init__(self):
# Map from a string name to the cuda rng state.
self.states_ = {}
# Seeds are just for bookkeeping and to ensure no seed is set twice.
self.seeds_ = set()
def reset(self):
"""Set to the initial state (no tracker)."""
self.states_ = {}
self.seeds_ = set()
def get_states(self):
"""Get rng states. Copy the dictionary so we have direct
pointers to the states, not just a pointer to the dictionary."""
states = {}
for name in self.states_:
states[name] = self.states_[name]
return states
def set_states(self, states):
"""Set the rng states. For efficiency purposes, we do not check
the size of seed for compatibility."""
self.states_ = states
def add(self, name, seed):
"""Track the rng state."""
# Check seed is not already used.
if seed in self.seeds_:
raise Exception('seed {} already exists'.format(seed))
self.seeds_.add(seed)
# Check that state is not already defined.
if name in self.states_:
raise Exception('cuda rng state {} already exists'.format(name))
# Get the current rng state.
orig_rng_state = torch.cuda.get_rng_state()
# Set the new state and store it.
torch.cuda.manual_seed(seed)
self.states_[name] = torch.cuda.get_rng_state()
# Reset rng state to what it was.
_set_cuda_rng_state(orig_rng_state)
@contextlib.contextmanager
def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
"""Fork the cuda rng state, perform operations, and exit with
the original state."""
# Check if we have added the state
if name not in self.states_:
print(name, self.states_)
raise Exception('cuda rng state {} is not added'.format(name))
# Store current rng state.
orig_cuda_rng_state = torch.cuda.get_rng_state()
# Set rng state to the desired one
_set_cuda_rng_state(self.states_[name])
# Do the stuff we wanted to do.
try:
yield
finally:
# Update the current rng state for later use.
self.states_[name] = torch.cuda.get_rng_state()
# And set the state to the original state we started with.
_set_cuda_rng_state(orig_cuda_rng_state)
# RNG tracker object.
_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
def get_cuda_rng_tracker():
"""Get cuda rng tracker."""
return _CUDA_RNG_STATE_TRACKER
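
# Typical usage sketch for the tracker: once a state has been registered (for
# example by model_parallel_cuda_manual_seed below), fork it around operations
# whose randomness must differ across tensor-model-parallel ranks, e.g.
# dropout inside parallel regions. Requires a CUDA device; the function name
# and arguments are illustrative only.
def _rng_tracker_usage_sketch(hidden_states, dropout_prob=0.1):
    with get_cuda_rng_tracker().fork():
        return torch.nn.functional.dropout(hidden_states, p=dropout_prob)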
def model_parallel_cuda_manual_seed(seed):
"""Initialize model parallel cuda seed.
This function should be called after the model parallel is
initialized. Also, no torch.cuda.manual_seed should be called
after this function. Basically, this is a replacement for that
function.
Two sets of RNG states are tracked:
default state: This is for data parallelism and is the same among a
set of model parallel GPUs but different across
different model parallel groups. This is used for
example for dropout in the non-tensor-model-parallel regions.
tensor-model-parallel state: This state is different among a set of model
parallel GPUs, but the same across data parallel
groups. This is used for example for dropout in
model parallel regions.
"""
# 2718 is just for fun and any POSITIVE value will work.
offset = seed + 2718
tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
# Data parallel gets the original seed.
data_parallel_seed = seed
if torch.distributed.get_rank() == 0:
print('> initializing model parallel cuda seeds on global rank {}, '
'model parallel rank {}, and data parallel rank {} with '
'model parallel seed: {} and data parallel seed: {}'.format(
torch.distributed.get_rank(), get_tensor_model_parallel_rank(),
get_data_parallel_rank(), tensor_model_parallel_seed,
data_parallel_seed), flush=True)
_CUDA_RNG_STATE_TRACKER.reset()
# Set the default state.
torch.cuda.manual_seed(data_parallel_seed)
# and model parallel state.
_CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
tensor_model_parallel_seed)
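
# Worked example of the seeding scheme above, assuming seed=1234 and a
# tensor-model-parallel group of 4 ranks: all ranks share the default
# (data-parallel) seed, while the tensor-model-parallel seed differs per rank.
def _seed_layout_sketch(seed=1234, tensor_model_parallel_world_size=4):
    data_parallel_seed = seed
    tensor_model_parallel_seeds = [
        seed + 2718 + rank
        for rank in range(tensor_model_parallel_world_size)]
    # data_parallel_seed == 1234
    # tensor_model_parallel_seeds == [3952, 3953, 3954, 3955]
    return data_parallel_seed, tensor_model_parallel_seeds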
class CheckpointFunction(torch.autograd.Function):
"""This function is adapted from torch.utils.checkpoint with
two main changes:
1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
2) the states in the model parallel tracker are also properly
tracked/set/reset.
"""
@staticmethod
def forward(ctx, run_function, *args):
ctx.run_function = run_function
# Copy the rng states.
ctx.fwd_cpu_rng_state = torch.get_rng_state()
ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
with torch.no_grad():
outputs = run_function(*args)
# Divide hidden states across model parallel group and only keep
# the chunk corresponding to the current rank.
if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
ctx.input_0_shape = args[0].data.shape
args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(
args[0].data)
# Store everything.
ctx.save_for_backward(*args)
return outputs
@staticmethod
def backward(ctx, *args):
if not torch.autograd._is_checkpoint_valid():
raise RuntimeError("Checkpointing is not compatible with .grad(), "
"please use .backward() if possible")
inputs = ctx.saved_tensors
if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
inputs[0].data = gather_split_1d_tensor(inputs[0].data)
inputs[0].data = inputs[0].data.view(ctx.input_0_shape)
# Store the current states.
bwd_cpu_rng_state = torch.get_rng_state()
bwd_cuda_rng_state = torch.cuda.get_rng_state()
bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
# Set the states to what it used to be before the forward pass.
torch.set_rng_state(ctx.fwd_cpu_rng_state)
_set_cuda_rng_state(ctx.fwd_cuda_rng_state)
get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
# Compute the forward pass.
detached_inputs = detach_variable(inputs)
with torch.enable_grad():
outputs = ctx.run_function(*detached_inputs)
# Set the states back to what it was at the start of this function.
torch.set_rng_state(bwd_cpu_rng_state)
_set_cuda_rng_state(bwd_cuda_rng_state)
get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
if isinstance(outputs, torch.Tensor):
outputs = (outputs,)
torch.autograd.backward(outputs, args)
grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
for inp in detached_inputs)
return (None,) + grads
def checkpoint(function, *args):
"""Checkpoint a model or part of the model.
This has been directly copied from torch.utils.checkpoint."""
return CheckpointFunction.apply(function, *args)
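
# Usage sketch for the checkpoint() helper above: wrap a forward callable so
# its activations are recomputed during backward instead of being stored.
# The two-layer module is hypothetical and a CUDA device is required, since
# the function snapshots CUDA RNG state.
def _checkpoint_usage_sketch():
    layer = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).cuda()

    def custom_forward(inputs):
        return layer(inputs)

    x = torch.randn(2, 8, device='cuda', requires_grad=True)
    out = checkpoint(custom_forward, x)
    out.sum().backward()
    return x.grad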
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import numpy
import torch
import mpu
class IdentityLayer(torch.nn.Module):
def __init__(self, size, scale=1.0):
super(IdentityLayer, self).__init__()
self.weight = torch.nn.Parameter(scale * torch.randn(size))
def forward(self):
return self.weight
def set_random_seed(seed):
"""Set random seed for reproducability."""
random.seed(seed)
numpy.random.seed(seed)
torch.manual_seed(seed)
mpu.model_parallel_cuda_manual_seed(seed)
def initialize_distributed(backend='nccl'):
"""Initialize torch.distributed."""
# Get local rank in case it is provided.
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=None,
help='local rank passed from distributed launcher')
args = parser.parse_args()
local_rank = args.local_rank
# Get rank and world size.
rank = int(os.getenv('RANK', '0'))
world_size = int(os.getenv("WORLD_SIZE", '1'))
print('> initializing torch.distributed with local rank: {}, '
'rank: {}, world size: {}'.format(local_rank, rank, world_size))
# Set the device id.
device = rank % torch.cuda.device_count()
if local_rank is not None:
device = local_rank
torch.cuda.set_device(device)
# Call the init process.
init_method = 'tcp://'
master_ip = os.getenv('MASTER_ADDR', 'localhost')
master_port = os.getenv('MASTER_PORT', '6000')
init_method += master_ip + ':' + master_port
torch.distributed.init_process_group(
backend=backend,
world_size=world_size,
rank=rank,
init_method=init_method)
def print_separator(message):
torch.distributed.barrier()
filler_len = (78 - len(message)) // 2
filler = '-' * filler_len
string = '\n' + filler + ' {} '.format(message) + filler
if torch.distributed.get_rank() == 0:
print(string, flush=True)
torch.distributed.barrier()
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from commons import set_random_seed
from commons import IdentityLayer
from commons import print_separator
from commons import initialize_distributed
from mpu.cross_entropy import vocab_parallel_cross_entropy
import mpu
import torch.nn.functional as F
import torch
import random
import sys
sys.path.append("../..")
def torch_cross_entropy(batch_size, seq_length, vocab_size,
logits_scale, seed):
set_random_seed(seed)
identity = IdentityLayer((batch_size, seq_length, vocab_size),
scale=logits_scale).cuda()
logits = identity()
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
target.view(-1),
reduction='none').view_as(target).mean()
loss.backward()
return loss, identity.weight.grad
def mpu_cross_entropy(batch_size, seq_length, vocab_size,
logits_scale, seed):
set_random_seed(seed)
identity = IdentityLayer((batch_size, seq_length, vocab_size),
scale=logits_scale).cuda()
logits = identity()
logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits)
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
loss.backward()
return loss, identity.weight.grad
def test_cross_entropy(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing cross entropy with model parallel size {} ...'.
format(tensor_model_parallel_size))
mpu.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
batch_size = 13
seq_length = 17
vocab_size_per_partition = 11
logits_scale = 1000.0
vocab_size = vocab_size_per_partition * tensor_model_parallel_size
seed = 1234
loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
vocab_size, logits_scale,
seed)
loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
vocab_size, logits_scale,
seed)
error = loss_torch.sub_(loss_mpu).abs().max()
print(' max error in loss on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = grad_torch.sub_(grad_mpu).abs().max()
print(' max error in grad on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
mpu.destroy_tensor_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
if __name__ == '__main__':
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test cross entropy')
test_cross_entropy(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from commons import print_separator
from commons import initialize_distributed
from mpu import data as data_utils
import mpu
import torch
import functools
import operator
import sys
sys.path.append("../..")
def test_broadcast_data(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing broadcast_data with model parallel size {} ...'.
format(tensor_model_parallel_size))
mpu.initialize_model_parallel(tensor_model_parallel_size)
torch.manual_seed(1234 + mpu.get_data_parallel_rank())
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
key_size_t = {'key1': [7, 11],
'key2': [8, 2, 1],
'key3': [13],
'key4': [5, 1, 2],
'key5': [5, 12]}
keys = list(key_size_t.keys())
data = {}
data_t = {}
for key in key_size_t:
data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
data_t[key] = data[key].clone()
data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
data_t['keyX'] = data['keyX'].clone()
if mpu.get_tensor_model_parallel_rank() != 0:
data = None
data_utils._check_data_types(keys, data_t, torch.int64)
key_size, key_numel, \
total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
for key in keys:
assert key_size[key] == key_size_t[key]
total_numel_t = 0
for key in keys:
target_size = functools.reduce(operator.mul, key_size_t[key], 1)
assert key_numel[key] == target_size
total_numel_t += target_size
assert total_numel == total_numel_t
data_b = data_utils.broadcast_data(keys, data, torch.int64)
for key in keys:
tensor = data_t[key].cuda()
assert data_b[key].sub(tensor).abs().max() == 0
# Reset groups
mpu.destroy_tensor_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
if __name__ == '__main__':
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test broadcast data')
test_broadcast_data(tensor_model_parallel_size)
tensor_model_parallel_size *= 2