initial llama

0211193c · zhuwenwen · 0211193c · 0211193c · 0211193c · 0211193c
Commit 0211193c authored Aug 17, 2023 by zhuwenwen
20 changed files
--- a/3rdparty/Megatron-LM/megatron/model/biencoder_model.py
+++ b/3rdparty/Megatron-LM/megatron/model/biencoder_model.py
+import os
+import torch
+import sys
+from megatron import get_args, print_rank_0
+from megatron.checkpointing import fix_query_key_value_ordering
+from megatron.checkpointing import get_checkpoint_tracker_filename
+from megatron.checkpointing import get_checkpoint_name
+from megatron import mpu, get_tokenizer
+from megatron.model.bert_model import bert_position_ids
+from megatron.model.enums import AttnMaskType
+from megatron.model.language_model import get_language_model
+from megatron.model.utils import get_linear_layer
+from megatron.model.utils import init_method_normal
+from megatron.model.utils import scaled_init_method_normal
+from .module import MegatronModule
+def get_model_provider(only_query_model=False, only_context_model=False,
+                       biencoder_shared_query_context_model=False):
+    """Return a `model_provider` closure bound to the given biencoder flags.
+
+    The returned callable takes `pre_process` / `post_process` and forwards
+    them, together with the three flags captured here, to
+    `biencoder_model_provider`.
+    """
+    # Flags fixed at closure-creation time.
+    captured_flags = dict(
+        only_query_model=only_query_model,
+        only_context_model=only_context_model,
+        biencoder_shared_query_context_model=biencoder_shared_query_context_model)
+    def model_provider(pre_process=True, post_process=True):
+        """Build the model."""
+        print_rank_0('building Bienoder model ...')
+        return biencoder_model_provider(pre_process=pre_process,
+                                        post_process=post_process,
+                                        **captured_flags)
+    return model_provider
+def biencoder_model_provider(only_query_model=False,
+                             only_context_model=False,
+                             biencoder_shared_query_context_model=False,
+                             pre_process=True,
+                             post_process=True):
+    """Construct and return a `BiEncoderModel`.
+
+    Asserts that tensor- and pipeline-model-parallel world sizes are both 1.
+    All arguments are passed through to `BiEncoderModel`; `num_tokentypes`
+    is fixed to 2 and `parallel_output` to False.
+    """
+    assert (mpu.get_tensor_model_parallel_world_size() == 1
+            and mpu.get_pipeline_model_parallel_world_size() == 1), \
+        "Model parallel size > 1 not supported for ICT"
+    print_rank_0('building BiEncoderModel...')
+    # Two token types are kept because the LM used for initialization
+    # has two token types.
+    model_kwargs = {
+        'num_tokentypes': 2,
+        'parallel_output': False,
+        'only_query_model': only_query_model,
+        'only_context_model': only_context_model,
+        'biencoder_shared_query_context_model':
+            biencoder_shared_query_context_model,
+        'pre_process': pre_process,
+        'post_process': post_process,
+    }
+    return BiEncoderModel(**model_kwargs)
+class BiEncoderModel(MegatronModule):
+    """Bert-based module for Biencoder model.
+
+    Wraps a query encoder and/or a context encoder, each an instance of
+    `PretrainedBertModel`. When `biencoder_shared_query_context_model` is
+    set, one encoder instance is used for both roles.
+    """
+    def __init__(self,
+                 num_tokentypes=1,
+                 parallel_output=True,
+                 only_query_model=False,
+                 only_context_model=False,
+                 biencoder_shared_query_context_model=False,
+                 pre_process=True,
+                 post_process=True):
+        """Create the encoder(s).
+
+        Arguments:
+            num_tokentypes, parallel_output, pre_process, post_process:
+                forwarded unchanged to every `PretrainedBertModel`.
+            only_query_model: if true, the context side is disabled
+                (`use_context_model` becomes False).
+            only_context_model: if true, the query side is disabled
+                (`use_query_model` becomes False). Must not be combined
+                with `only_query_model`.
+            biencoder_shared_query_context_model: if true, build a single
+                encoder and expose it as both `query_model` and
+                `context_model`.
+        """
+        super(BiEncoderModel, self).__init__()
+        args = get_args()
+        bert_kwargs = dict(
+            num_tokentypes=num_tokentypes,
+            parallel_output=parallel_output,
+            pre_process=pre_process,
+            post_process=post_process)
+        self.biencoder_shared_query_context_model = \
+            biencoder_shared_query_context_model
+        assert not (only_context_model and only_query_model)
+        self.use_context_model = not only_query_model
+        self.use_query_model = not only_context_model
+        self.biencoder_projection_dim = args.biencoder_projection_dim
+        if self.biencoder_shared_query_context_model:
+            self.model = PretrainedBertModel(**bert_kwargs)
+            self._model_key = 'shared_model'
+            self.query_model, self.context_model = self.model, self.model
+        else:
+            if self.use_query_model:
+                # this model embeds (pseudo-)queries - Embed_input in the paper
+                self.query_model = PretrainedBertModel(**bert_kwargs)
+                self._query_key = 'query_model'
+            if self.use_context_model:
+                # this model embeds evidence blocks - Embed_doc in the paper
+                self.context_model = PretrainedBertModel(**bert_kwargs)
+                self._context_key = 'context_model'
+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        # Placeholder: intentionally does nothing. It will be needed once
+        # model parallelism is supported (the provider above asserts that
+        # it currently is not).
+        return
+    def forward(self, query_tokens, query_attention_mask, query_types,
+                context_tokens, context_attention_mask, context_types):
+        """Run a forward pass for each of the models and
+        return the respective embeddings.
+
+        Returns:
+            Tuple `(query_logits, context_logits)`.
+
+        Raises:
+            ValueError: if the query side or the context side is disabled.
+        """
+        if self.use_query_model:
+            query_logits = self.embed_text(self.query_model,
+                                           query_tokens,
+                                           query_attention_mask,
+                                           query_types)
+        else:
+            raise ValueError("Cannot embed query without the query model.")
+        if self.use_context_model:
+            context_logits = self.embed_text(self.context_model,
+                                             context_tokens,
+                                             context_attention_mask,
+                                             context_types)
+        else:
+            raise ValueError("Cannot embed block without the block model.")
+        return query_logits, context_logits
+    @staticmethod
+    def embed_text(model, tokens, attention_mask, token_types):
+        """Embed a batch of tokens using the model"""
+        logits = model(tokens,
+                       attention_mask,
+                       token_types)
+        return logits
+    def state_dict_for_save_checkpoint(self, destination=None, \
+        prefix='', keep_vars=False):
+        """Save dict with state dicts of each of the models.
+
+        The result is keyed by 'shared_model' in shared mode, otherwise by
+        'query_model' and/or 'context_model' for the enabled sides.
+        """
+        state_dict_ = {}
+        if self.biencoder_shared_query_context_model:
+            state_dict_[self._model_key] = \
+                self.model.state_dict_for_save_checkpoint(destination,
+                                                          prefix,
+                                                          keep_vars)
+        else:
+            if self.use_query_model:
+                state_dict_[self._query_key] = \
+                    self.query_model.state_dict_for_save_checkpoint(
+                        destination, prefix, keep_vars)
+            if self.use_context_model:
+                state_dict_[self._context_key] = \
+                    self.context_model.state_dict_for_save_checkpoint(
+                        destination, prefix, keep_vars)
+        return state_dict_
+    def load_state_dict(self, state_dict, strict=True):
+        """Load the state dicts of each of the models
+
+        Expects the layout produced by `state_dict_for_save_checkpoint`.
+        """
+        if self.biencoder_shared_query_context_model:
+            print_rank_0("Loading shared query-context model")
+            self.model.load_state_dict(state_dict[self._model_key], \
+                strict=strict)
+        else:
+            if self.use_query_model:
+                print_rank_0("Loading query model")
+                self.query_model.load_state_dict( \
+                    state_dict[self._query_key], strict=strict)
+            if self.use_context_model:
+                print_rank_0("Loading context model")
+                self.context_model.load_state_dict( \
+                    state_dict[self._context_key], strict=strict)
+    def init_state_dict_from_bert(self):
+        """Initialize the state from a pretrained BERT model
+        on iteration zero of ICT pretraining
+
+        Does nothing when `args.bert_load` is None.
+
+        Raises:
+            FileNotFoundError: if the checkpoint tracker file is missing.
+            SystemExit: (status 1) if the checkpoint cannot be deserialized.
+        """
+        args = get_args()
+        if args.bert_load is None:
+            print_rank_0("bert-load argument is None")
+            return
+        tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
+        if not os.path.isfile(tracker_filename):
+            raise FileNotFoundError("Could not find BERT checkpoint")
+        with open(tracker_filename, 'r') as f:
+            iteration = int(f.read().strip())
+            assert iteration > 0
+        checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
+        if mpu.get_data_parallel_rank() == 0:
+            print('global rank {} is loading BERT checkpoint {}'.format(
+                torch.distributed.get_rank(), checkpoint_name))
+        # Load the checkpoint.
+        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
+        try:
+            state_dict = torch.load(checkpoint_name, map_location='cpu')
+        except ModuleNotFoundError:
+            from megatron.fp16_deprecated import loss_scaler
+            # For backward compatibility: alias the old module paths while
+            # the checkpoint is being unpickled.
+            print_rank_0(' > deserializing using the old code structure ...')
+            sys.modules['fp16.loss_scaler'] = sys.modules[
+                'megatron.fp16_deprecated.loss_scaler']
+            sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
+                'megatron.fp16_deprecated.loss_scaler']
+            try:
+                state_dict = torch.load(checkpoint_name, map_location='cpu')
+            finally:
+                # Always remove the temporary aliases, even if the retry fails.
+                sys.modules.pop('fp16.loss_scaler', None)
+                sys.modules.pop('megatron.fp16.loss_scaler', None)
+        except Exception:
+            # `Exception`, not `BaseException`, so KeyboardInterrupt and
+            # SystemExit are not swallowed; exit non-zero to signal failure.
+            print_rank_0('could not load the BERT checkpoint')
+            sys.exit(1)
+        checkpoint_version = state_dict.get('checkpoint_version', 0)
+        # load the LM state dict into each model
+        model_dict = state_dict['model']['language_model']
+        # Projection-head weights of the query model, copied into the context
+        # model below; stays None when there is no query model or no head.
+        query_proj_state_dict = None
+        if self.biencoder_shared_query_context_model:
+            self.model.language_model.load_state_dict(model_dict)
+            fix_query_key_value_ordering(self.model, checkpoint_version)
+        else:
+            if self.use_query_model:
+                self.query_model.language_model.load_state_dict(model_dict)
+                # give each model the same ict_head to begin with as well
+                if self.biencoder_projection_dim > 0:
+                    query_proj_state_dict = \
+                        self.state_dict_for_save_checkpoint()\
+                        [self._query_key]['projection_enc']
+                fix_query_key_value_ordering(self.query_model, checkpoint_version)
+            if self.use_context_model:
+                self.context_model.language_model.load_state_dict(model_dict)
+                # `self.query_model` is not assigned in context-only mode,
+                # so test the captured state dict instead of the attribute.
+                if query_proj_state_dict is not None:
+                    self.context_model.projection_enc.load_state_dict\
+                        (query_proj_state_dict)
+                fix_query_key_value_ordering(self.context_model, checkpoint_version)
+class PretrainedBertModel(MegatronModule):
+    """BERT-based encoder for queries or contexts used for
+    learned information retrieval."""
+    def __init__(self, num_tokentypes=2,
+            parallel_output=True, pre_process=True, post_process=True):
+        """Build the language model and, if configured, a projection head.
+
+        A linear `projection_enc` layer (hidden_size ->
+        biencoder_projection_dim) is created only when
+        `args.biencoder_projection_dim > 0`.
+        """
+        super(PretrainedBertModel, self).__init__()
+        args = get_args()
+        tokenizer = get_tokenizer()
+        self.pad_id = tokenizer.pad
+        self.biencoder_projection_dim = args.biencoder_projection_dim
+        self.parallel_output = parallel_output
+        self.pre_process = pre_process
+        self.post_process = post_process
+        init_method = init_method_normal(args.init_method_std)
+        scaled_init_method = scaled_init_method_normal(
+            args.init_method_std, args.num_layers)
+        self.language_model, self._language_model_key = get_language_model(
+            num_tokentypes=num_tokentypes,
+            add_pooler=False,
+            encoder_attn_mask_type=AttnMaskType.padding,
+            init_method=init_method,
+            scaled_init_method=scaled_init_method,
+            pre_process=self.pre_process,
+            post_process=self.post_process)
+        if args.biencoder_projection_dim > 0:
+            self.projection_enc = get_linear_layer(args.hidden_size,
+                                                   args.biencoder_projection_dim,
+                                                   init_method)
+            self._projection_enc_key = 'projection_enc'
+    def forward(self, input_ids, attention_mask, tokentype_ids=None):
+        """Encode the inputs and return the pooled representation.
+
+        The pooled representation is the language-model output at index 0
+        of dimension 1, optionally passed through `projection_enc`.
+        """
+        # Insert a singleton dimension at index 1 of the mask before passing
+        # it on (the `bert_extended_attention_mask` helper is not used here).
+        extended_attention_mask = attention_mask.unsqueeze(1)
+        position_ids = bert_position_ids(input_ids)
+        lm_output = self.language_model(input_ids,
+                                        position_ids,
+                                        extended_attention_mask,
+                                        tokentype_ids=tokentype_ids)
+        # Taking the representation of the [CLS] token of BERT
+        # (assumes dimension 1 is the sequence dimension - TODO confirm).
+        pooled_output = lm_output[:, 0, :]
+        # Output. Same `> 0` test that guards creation of the projection
+        # head, so the layer is only used when it exists.
+        if self.biencoder_projection_dim > 0:
+            pooled_output = self.projection_enc(pooled_output)
+        return pooled_output
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """For easy load when model is combined with other heads,
+        add an extra key."""
+        state_dict_ = {}
+        state_dict_[self._language_model_key] \
+            = self.language_model.state_dict_for_save_checkpoint(
+            destination, prefix, keep_vars)
+        if self.biencoder_projection_dim > 0:
+            state_dict_[self._projection_enc_key] = \
+                self.projection_enc.state_dict(destination, prefix, keep_vars)
+        return state_dict_
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load.
+
+        Reads the language-model entry and, when a projection head is
+        configured, the 'projection_enc' entry from `state_dict`.
+        """
+        print_rank_0("loading pretrained weights")
+        self.language_model.load_state_dict(
+            state_dict[self._language_model_key], strict=strict)
+        if self.biencoder_projection_dim > 0:
+            print_rank_0("loading projection head weights")
+            self.projection_enc.load_state_dict(
+                state_dict[self._projection_enc_key], strict=strict)
--- a/3rdparty/Megatron-LM/megatron/model/classification.py
+++ b/3rdparty/Megatron-LM/megatron/model/classification.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classification model."""
+import torch
+from megatron import get_args, print_rank_last
+from megatron import mpu
+from megatron.model.enums import AttnMaskType
+from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
+from megatron.model.language_model import get_language_model
+from megatron.model.utils import get_linear_layer
+from megatron.model.utils import init_method_normal
+from megatron.model.utils import scaled_init_method_normal
+from .module import MegatronModule
+class Classification(MegatronModule):
+    """Language model with a pooler followed by a linear classification head."""
+    def __init__(self,
+                 num_classes,
+                 num_tokentypes=2,
+                 pre_process=True,
+                 post_process=True):
+        """Build the language model; add dropout + linear head if `post_process`."""
+        super(Classification, self).__init__(share_word_embeddings=False)
+        args = get_args()
+        self.num_classes = num_classes
+        self.pre_process = pre_process
+        self.post_process = post_process
+        init_method = init_method_normal(args.init_method_std)
+        scaled_init_method = scaled_init_method_normal(args.init_method_std,
+                                                       args.num_layers)
+        language_model, language_model_key = get_language_model(
+            num_tokentypes=num_tokentypes,
+            add_pooler=True,
+            encoder_attn_mask_type=AttnMaskType.padding,
+            init_method=init_method,
+            scaled_init_method=scaled_init_method,
+            pre_process=self.pre_process,
+            post_process=self.post_process)
+        self.language_model = language_model
+        self._language_model_key = language_model_key
+        # The head only exists when this module does post-processing.
+        if self.post_process:
+            self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
+            self.classification_head = get_linear_layer(args.hidden_size,
+                                                        self.num_classes,
+                                                        init_method)
+            self._classification_head_key = 'classification_head'
+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        self.language_model.set_input_tensor(input_tensor)
+    def forward(self, model_input, attention_mask, tokentype_ids=None):
+        """Return class logits when `post_process`, else the raw LM output."""
+        input_ids = model_input
+        extended_attention_mask = bert_extended_attention_mask(attention_mask)
+        position_ids = bert_position_ids(input_ids)
+        lm_output = self.language_model(
+            input_ids,
+            position_ids,
+            extended_attention_mask,
+            tokentype_ids=tokentype_ids
+        )
+        if not self.post_process:
+            return lm_output
+        # The language model yields a pair; only the pooled part is used.
+        _, pooled_output = lm_output
+        dropped = self.classification_dropout(pooled_output)
+        logits = self.classification_head(dropped)
+        # Flatten to rows of `num_classes` logits.
+        return logits.view(-1, self.num_classes)
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """Return a dict with the language-model state under its own key,
+        plus the classification head state when `post_process`."""
+        checkpoint_state = {
+            self._language_model_key:
+                self.language_model.state_dict_for_save_checkpoint(
+                    destination, prefix, keep_vars),
+        }
+        if self.post_process:
+            checkpoint_state[self._classification_head_key] = \
+                self.classification_head.state_dict(
+                    destination, prefix, keep_vars)
+        return checkpoint_state
+    def load_state_dict(self, state_dict, strict=True):
+        """Load the language model, then the head if present in `state_dict`.
+
+        A missing head entry only produces a warning; the head then keeps
+        its initial weights.
+        """
+        self.language_model.load_state_dict(
+            state_dict[self._language_model_key], strict=strict)
+        if not self.post_process:
+            return
+        head_key = self._classification_head_key
+        if head_key not in state_dict:
+            print_rank_last('***WARNING*** could not find {} in the checkpoint, '
+                            'initializing to random'.format(head_key))
+            return
+        self.classification_head.load_state_dict(
+            state_dict[head_key], strict=strict)
--- a/3rdparty/Megatron-LM/megatron/model/distributed.py
+++ b/3rdparty/Megatron-LM/megatron/model/distributed.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC
+from abc import abstractmethod
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from megatron import get_args
+from megatron import mpu
+from .module import MegatronModule
+class MemoryBuffer:
+    """Flat zero-initialised 1-D tensor on the current CUDA device from
+    which shaped views can be taken."""
+    def __init__(self, numel, dtype):
+        """Allocate `numel` zeros of type `dtype` (no gradient tracking)."""
+        self.numel = numel
+        self.dtype = dtype
+        self.data = torch.zeros(numel,
+                                dtype=dtype,
+                                device=torch.cuda.current_device(),
+                                requires_grad=False)
+    def zero(self):
+        """Reset the buffer to zero."""
+        self.data.zero_()
+    def get(self, shape, start_index):
+        """Return a view of `shape` over the flat data, beginning at
+        element `start_index`. Asserts the view fits inside the buffer."""
+        stop_index = start_index + shape.numel()
+        assert stop_index <= self.numel, \
+            'requested tensor is out of the buffer range.'
+        return self.data[start_index:stop_index].view(shape)
+class DistributedDataParallelBase(MegatronModule, ABC):
+    """Abstract class for DDP.
+
+    Wraps a model and delegates forward, state-dict and load calls to it.
+    Subclasses must implement `allreduce_gradients`.
+    """
+    def __init__(self, module):
+        """Store `module` as the wrapped model."""
+        super(DistributedDataParallelBase, self).__init__()
+        # Keep a pointer to the model.
+        self.module = module
+    @abstractmethod
+    def allreduce_gradients(self):
+        """Reduce gradients; to be provided by the concrete subclass."""
+        pass
+    def forward(self, *inputs, **kwargs):
+        """Call the wrapped module with the given arguments."""
+        return self.module(*inputs, **kwargs)
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        """Return the wrapped module's `state_dict`."""
+        return self.module.state_dict(destination, prefix, keep_vars)
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """Return the wrapped module's `state_dict_for_save_checkpoint`."""
+        return self.module.state_dict_for_save_checkpoint(destination, prefix,
+                                                          keep_vars)
+    def load_state_dict(self, state_dict, strict=True):
+        """Load `state_dict` into the wrapped module."""
+        self.module.load_state_dict(state_dict, strict=strict)
+class DistributedDataParallel(DistributedDataParallelBase):
+    """DDP with contiguous buffers options to store and accumulate gradients.
+    This class:
+        - has the potential to reduce memory fragmentation.
+        - provides the option to do the gradient accumulation
+          in a type other than the params type (for example fp32)
+    Arguments:
+        module: input model.
+        accumulate_allreduce_grads_in_fp32: if true do the gradient accumulation
+            and the gradient all-reduce all in float32. If this option is
+            true, we require `use_contiguous_buffers` to be true too.
+        use_contiguous_buffers: if true, use a contiguous buffer to store the
+            gradients.
+    """
+    def __init__(self, module,
+                 accumulate_allreduce_grads_in_fp32,
+                 use_contiguous_buffers):
+        """Wrap `module`; when contiguous buffers are enabled, allocate one
+        buffer per gradient dtype, point each parameter's `main_grad` at a
+        slice of it, and register hooks that accumulate into `main_grad`."""
+        super(DistributedDataParallel, self).__init__(module)
+        self.accumulate_allreduce_grads_in_fp32 \
+            = accumulate_allreduce_grads_in_fp32
+        self.use_contiguous_buffers = use_contiguous_buffers
+        # If we are using fp32-accumulate-allreduce explicitly
+        # this means we need main grads in a contiguous buffer.
+        if self.accumulate_allreduce_grads_in_fp32:
+            assert self.use_contiguous_buffers
+        # ===================================
+        # Rest of this part applies only to
+        # the case we use contiguous buffers.
+        # ===================================
+        self._grad_buffers = None
+        if self.use_contiguous_buffers:
+            self._grad_buffers = {}
+            # Simple function to define buffer type.
+            def _get_buffer_type(param):
+                return torch.float if \
+                    self.accumulate_allreduce_grads_in_fp32 else param.dtype
+            # First calculate total number of elements per type.
+            type_num_elements = {}
+            for param in self.module.parameters():
+                if param.requires_grad:
+                    dtype = _get_buffer_type(param)
+                    type_num_elements[dtype] = type_num_elements.get(dtype, 0) \
+                                               + param.data.nelement()
+            # Allocate the buffer.
+            for dtype, num_elements in type_num_elements.items():
+                self._grad_buffers[dtype] = MemoryBuffer(num_elements, dtype)
+            # Assume the back prop order is reverse the params order,
+            # store the start index for the gradients. The running count is
+            # decremented first, so slices are handed out from the end of
+            # the buffer towards its start.
+            for param in self.module.parameters():
+                if param.requires_grad:
+                    dtype = _get_buffer_type(param)
+                    type_num_elements[dtype] -= param.data.nelement()
+                    param.main_grad = self._grad_buffers[dtype].get(
+                        param.data.shape, type_num_elements[dtype])
+            # Backward hook.
+            # Accumulation function for the gradients. We need
+            # to store them so they don't go out of scope.
+            self.grad_accs = []
+            # Loop over all the parameters in the model.
+            for param in self.module.parameters():
+                if param.requires_grad:
+                    # Expand so we get access to grad_fn.
+                    param_tmp = param.expand_as(param)
+                    # Get the gradient accumulator function.
+                    grad_acc = param_tmp.grad_fn.next_functions[0][0]
+                    grad_acc.register_hook(self._make_param_hook(param))
+                    self.grad_accs.append(grad_acc)
+    def _make_param_hook(self, param):
+        """Create the all-reduce hook for backprop."""
+        # Hook used for back-prop.
+        def param_hook(*unused):
+            # Add the gradient to the buffer. Test `param.grad` itself:
+            # reading `.data` from a missing (None) gradient would raise
+            # AttributeError before any comparison could happen.
+            if param.grad is not None:
+                param.main_grad.add_(param.grad.data)
+                # Now we can deallocate grad memory.
+                param.grad = None
+        return param_hook
+    def zero_grad_buffer(self):
+        """Set the grad buffer data to zero. Needs to be called at the
+        beginning of each iteration."""
+        assert self._grad_buffers is not None, 'buffers are not initialized.'
+        for buffer_ in self._grad_buffers.values():
+            buffer_.zero()
+    def broadcast_params(self):
+        """Broadcast every parameter from the data-parallel source rank to
+        the data-parallel group."""
+        for param in self.module.parameters():
+            torch.distributed.broadcast(param.data,
+                                        src=mpu.get_data_parallel_src_rank(),
+                                        group=mpu.get_data_parallel_group())
+    def allreduce_gradients(self):
+        """Reduce gradients across data parallel ranks."""
+        # If we have buffers, simply reduce the data in the buffer.
+        if self._grad_buffers is not None:
+            for buffer_ in self._grad_buffers.values():
+                # Divide before the sum-reduce so the result is the average.
+                buffer_.data /= mpu.get_data_parallel_world_size()
+                torch.distributed.all_reduce(
+                    buffer_.data, group=mpu.get_data_parallel_group())
+        else:
+            # Otherwise, bucketize and all-reduce
+            buckets = {}
+            # Pack the buckets.
+            for param in self.module.parameters():
+                if param.requires_grad and param.grad is not None:
+                    tp = param.data.type()
+                    if tp not in buckets:
+                        buckets[tp] = []
+                    buckets[tp].append(param)
+                    param.main_grad = param.grad
+            # For each bucket, all-reduce and copy all-reduced grads.
+            for tp in buckets:
+                bucket = buckets[tp]
+                grads = [param.grad.data for param in bucket]
+                coalesced = _flatten_dense_tensors(grads)
+                coalesced /= mpu.get_data_parallel_world_size()
+                torch.distributed.all_reduce(
+                    coalesced, group=mpu.get_data_parallel_group())
+                for buf, synced in zip(grads, _unflatten_dense_tensors(
+                        coalesced, grads)):
+                    buf.copy_(synced)
--- a/3rdparty/Megatron-LM/megatron/model/enums.py
+++ b/3rdparty/Megatron-LM/megatron/model/enums.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import enum
+class ModelType(enum.Enum):
+    """Kinds of model, named by their encoder/decoder makeup."""
+    # NOTE(review): presumably a single-stack model (encoder-only or
+    # decoder-only) - confirm against the code that reads ModelType.
+    encoder_or_decoder = 1
+    # NOTE(review): presumably a model with both stacks - confirm.
+    encoder_and_decoder = 2
+class LayerType(enum.Enum):
+    """Tags a layer as an encoder layer or a decoder layer."""
+    encoder = 1
+    decoder = 2
+class AttnType(enum.Enum):
+    """Tags an attention block as self-attention or cross-attention."""
+    self_attn = 1
+    cross_attn = 2
+class AttnMaskType(enum.Enum):
+    """Selects the attention-mask variant passed to the language model."""
+    # Passed as `encoder_attn_mask_type` by the BERT-based models
+    # (biencoder_model.py, classification.py).
+    padding = 1
+    causal = 2
--- a/3rdparty/Megatron-LM/megatron/model/fused_bias_gelu.py
+++ b/3rdparty/Megatron-LM/megatron/model/fused_bias_gelu.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+###### BIAS GELU FUSION/ NO AUTOGRAD ################
+# 1/sqrt(2*pi)-> 0.3989423
+# 1/sqrt(2)   -> 0.70710678
+# sqrt(2/pi)  -> 0.79788456
+# this function is tanh approximation of gelu
+# actual gelu is:
+# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
+@torch.jit.script
+def bias_gelu(bias, y):
+    """Tanh-approximated GELU applied to `bias + y`."""
+    x = bias + y
+    # 0.79788456 is sqrt(2/pi); operand order is kept exactly as before so
+    # floating-point results are unchanged.
+    tanh_arg = 0.79788456 * x * (1 + 0.044715 * x * x)
+    return  x * 0.5 * (1.0 + torch.tanh(tanh_arg))
+# gradient of tanh approximation of gelu
+# gradient of actual gelu is:
+# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
+@torch.jit.script
+def bias_gelu_back(g, bias, y):
+    """Gradient of the tanh-approximated GELU at `bias + y`, scaled by the
+    incoming gradient `g`."""
+    x = bias + y
+    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
+    # Derivative of tanh: 1 - tanh^2.
+    one_minus_tanh_sq = 1 - tanh_out * tanh_out
+    # Derivative of the tanh argument;
+    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
+    arg_derivative = 0.79788456 + 0.1070322243 * x * x
+    ff = 0.5 * x * (one_minus_tanh_sq * arg_derivative) + 0.5 * (1 + tanh_out)
+    return ff*g
+class GeLUFunction(torch.autograd.Function):
+    """Autograd wrapper pairing `bias_gelu` with its hand-written gradient
+    `bias_gelu_back`."""
+    @staticmethod
+    def forward(ctx, input, bias):
+        """Stash both operands for backward and return `bias_gelu(bias, input)`."""
+        ctx.save_for_backward(input, bias)
+        return bias_gelu(bias, input)
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the same gradient tensor for `input` and for `bias`."""
+        saved_input, saved_bias = ctx.saved_tensors
+        grad = bias_gelu_back(grad_output, saved_bias, saved_input)
+        return grad, grad
+# Functional entry point: bias_gelu_impl(input, bias).
+bias_gelu_impl = GeLUFunction.apply
--- a/3rdparty/Megatron-LM/megatron/model/fused_layer_norm.py
+++ b/3rdparty/Megatron-LM/megatron/model/fused_layer_norm.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This code is copied from NVIDIA apex:
+      https://github.com/NVIDIA/apex
+   with some changes. """
+import numbers
+import torch
+from torch.nn.parameter import Parameter
+from torch.nn import init
+import importlib
+# The persistent layer-norm kernel from apex is optional; record whether it
+# could be imported so MixedFusedLayerNorm can fall back when it is missing.
+try:
+    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
+    HAVE_PERSIST_LAYER_NORM = True
+except Exception:
+    # Any import-time failure disables the persistent kernel. `Exception`
+    # (rather than a bare except) lets KeyboardInterrupt/SystemExit propagate.
+    HAVE_PERSIST_LAYER_NORM = False
+# Handle to the fused CUDA extension; MixedFusedLayerNorm.__init__ assigns it
+# via importlib the first time a layer is constructed.
+fused_mix_prec_layer_norm_cuda = None
+class FusedLayerNormAffineFunction(torch.autograd.Function):
+    """Autograd function that dispatches affine layer norm to the
+    ``fused_mix_prec_layer_norm_cuda`` extension module."""
+    @staticmethod
+    def forward(ctx, input, weight, bias, normalized_shape, eps):
+        ctx.normalized_shape = normalized_shape
+        ctx.eps = eps
+        # Every tensor is made contiguous before it is passed to the extension.
+        contiguous_in = input.contiguous()
+        contiguous_w = weight.contiguous()
+        contiguous_b = bias.contiguous()
+        output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
+            contiguous_in, ctx.normalized_shape, contiguous_w, contiguous_b,
+            ctx.eps)
+        ctx.save_for_backward(
+            contiguous_in, contiguous_w, contiguous_b, mean, invvar)
+        return output
+    @staticmethod
+    def backward(ctx, grad_output):
+        saved_in, saved_w, saved_b, mean, invvar = ctx.saved_tensors
+        grads = fused_mix_prec_layer_norm_cuda.backward_affine(
+            grad_output.contiguous(), mean, invvar,
+            saved_in, ctx.normalized_shape,
+            saved_w, saved_b, ctx.eps)
+        grad_input, grad_weight, grad_bias = grads
+        # normalized_shape and eps are not differentiable: two trailing Nones.
+        return grad_input, grad_weight, grad_bias, None, None
+class MixedFusedLayerNorm(torch.nn.Module):
+    """Layer norm module backed by fused kernels.
+    Uses apex's ``FastLayerNormFN`` when the persistent kernel is requested,
+    importable and lists the hidden size as supported; otherwise uses
+    ``FusedLayerNormAffineFunction``.
+    """
+    def __init__(self, normalized_shape, eps=1e-5, no_persist_layer_norm=True):
+        super().__init__()
+        # Load the CUDA extension on first construction and publish it in the
+        # module-level global read by FusedLayerNormAffineFunction.
+        global fused_mix_prec_layer_norm_cuda
+        fused_mix_prec_layer_norm_cuda = importlib.import_module(
+            "fused_mix_prec_layer_norm_cuda")
+        # Hidden sizes supported by the persistent layer norm kernel; any
+        # other size falls back to the non-persistent kernel.
+        supported_hidden_sizes = [
+            1024, 1536, 2048, 2304, 3072, 3840, 4096, 5120, 6144, 8192,
+            10240, 12288, 12800, 15360, 16384, 18432, 20480, 24576, 25600,
+            30720, 32768, 40960, 49152, 65536]
+        persist_usable = (normalized_shape in supported_hidden_sizes
+                          and HAVE_PERSIST_LAYER_NORM)
+        if not persist_usable:
+            no_persist_layer_norm = True
+        shape = normalized_shape
+        if isinstance(shape, numbers.Integral):
+            shape = (shape,)
+        self.normalized_shape = torch.Size(shape)
+        self.eps = eps
+        self.weight = Parameter(torch.Tensor(*shape))
+        self.bias = Parameter(torch.Tensor(*shape))
+        self.reset_parameters()
+        self.no_persist_layer_norm = no_persist_layer_norm
+    def reset_parameters(self):
+        """Set the weight to ones and the bias to zeros."""
+        init.ones_(self.weight)
+        init.zeros_(self.bias)
+    def forward(self, input):
+        """Normalize ``input`` with whichever kernel was selected in __init__."""
+        if not self.no_persist_layer_norm:
+            return FastLayerNormFN.apply(
+                input, self.weight, self.bias, self.eps)
+        return FusedLayerNormAffineFunction.apply(
+            input, self.weight, self.bias, self.normalized_shape, self.eps)
--- a/3rdparty/Megatron-LM/megatron/model/fused_softmax.py
+++ b/3rdparty/Megatron-LM/megatron/model/fused_softmax.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+from megatron.model.enums import AttnMaskType
+class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs the following three operations in sequence
+    1. Scale the tensor.
+    2. Apply upper triangular mask (typically used in gpt models).
+    3. Perform softmax.
+    Both passes are delegated to the ``scaled_upper_triang_masked_softmax_cuda``
+    extension module, imported at call time.
+    """
+    @staticmethod
+    def forward(ctx, inputs, scale):
+        import scaled_upper_triang_masked_softmax_cuda as kernel
+        # The scale is wrapped in a tensor so it can be saved for backward.
+        scale_tensor = torch.tensor([scale])
+        probs = kernel.forward(inputs, scale_tensor[0])
+        ctx.save_for_backward(probs, scale_tensor)
+        return probs
+    @staticmethod
+    def backward(ctx, output_grads):
+        import scaled_upper_triang_masked_softmax_cuda as kernel
+        probs, scale_tensor = ctx.saved_tensors
+        grad_inputs = kernel.backward(output_grads, probs, scale_tensor[0])
+        # No gradient for the scale argument.
+        return grad_inputs, None
+class ScaledMaskedSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs the following three operations in sequence
+    1. Scale the tensor.
+    2. Apply the mask.
+    3. Perform softmax.
+    Both passes are delegated to the ``scaled_masked_softmax_cuda`` extension
+    module, imported at call time.
+    """
+    @staticmethod
+    def forward(ctx, inputs, mask, scale):
+        import scaled_masked_softmax_cuda as kernel
+        # The scale is wrapped in a tensor so it can be saved for backward.
+        scale_tensor = torch.tensor([scale])
+        probs = kernel.forward(inputs, mask, scale_tensor[0])
+        ctx.save_for_backward(probs, scale_tensor)
+        return probs
+    @staticmethod
+    def backward(ctx, output_grads):
+        import scaled_masked_softmax_cuda as kernel
+        probs, scale_tensor = ctx.saved_tensors
+        grad_inputs = kernel.backward(output_grads, probs, scale_tensor[0])
+        # No gradient for the mask or the scale argument.
+        return grad_inputs, None, None
+class ScaledSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs the following two operations in sequence
+    1. Scale the tensor.
+    2. Perform softmax.
+    Both passes are delegated to the ``scaled_softmax_cuda`` extension module,
+    imported at call time.
+    """
+    @staticmethod
+    def forward(ctx, inputs, scale):
+        import scaled_softmax_cuda
+        # The scale is wrapped in a tensor so it can be saved for backward.
+        scale_t = torch.tensor([scale])
+        softmax_results = scaled_softmax_cuda.forward(
+            inputs, scale_t[0]
+        )
+        ctx.save_for_backward(softmax_results, scale_t)
+        return softmax_results
+    @staticmethod
+    def backward(ctx, output_grads):
+        import scaled_softmax_cuda
+        softmax_results, scale_t = ctx.saved_tensors
+        input_grads = scaled_softmax_cuda.backward(
+            output_grads, softmax_results, scale_t[0]
+        )
+        # forward takes exactly two inputs (inputs, scale), so exactly two
+        # gradients are returned; scale gets no gradient.
+        return input_grads, None
+class FusedScaleMaskSoftmax(nn.Module):
+    """
+    fused operation: scaling + mask + softmax
+    Arguments:
+        input_in_fp16: flag to indicate if input is in fp16 data format.
+        input_in_bf16: flag to indicate if input is in bf16 data format.
+        attn_mask_type: attention mask type (pad or causal)
+        scaled_masked_softmax_fusion: flag to indicate the user wants to use softmax fusion
+        mask_func: mask function to be applied.
+        softmax_in_fp32: if true, softmax is performed at fp32 precision.
+        scale: scaling factor used in input tensor scaling.
+    """
+    def __init__(
+        self,
+        input_in_fp16,
+        input_in_bf16,
+        attn_mask_type,
+        scaled_masked_softmax_fusion,
+        mask_func,
+        softmax_in_fp32,
+        scale,
+    ):
+        super(FusedScaleMaskSoftmax, self).__init__()
+        self.input_in_fp16 = input_in_fp16
+        self.input_in_bf16 = input_in_bf16
+        assert not (
+            self.input_in_fp16 and self.input_in_bf16
+        ), "both fp16 and bf16 flags cannot be active at the same time."
+        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
+        self.attn_mask_type = attn_mask_type
+        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
+        self.mask_func = mask_func
+        self.softmax_in_fp32 = softmax_in_fp32
+        self.scale = scale
+        assert (
+            self.scale is None or softmax_in_fp32
+        ), "softmax should be in fp32 when scaled"
+    def forward(self, input, mask):
+        """Apply scale + mask + softmax to a 4D ``[b, np, sq, sk]`` input.
+        Uses the fused kernels when ``is_kernel_available`` allows it and the
+        plain torch implementation otherwise.
+        """
+        # [b, np, sq, sk]
+        assert input.dim() == 4
+        if self.is_kernel_available(mask, *input.size()):
+            return self.forward_fused_softmax(input, mask)
+        return self.forward_torch_softmax(input, mask)
+    def is_kernel_available(self, mask, b, np, sq, sk):
+        """Return True when the fused kernel can handle this problem size."""
+        attn_batches = b * np
+        eligible = (
+            self.scaled_masked_softmax_fusion  # user wants to fuse
+            and self.input_in_float16  # input must be fp16 or bf16
+            and 16 < sk <= 4096  # sk must be in (16, 4096]
+            and sq % 4 == 0  # sq must be a multiple of 4
+            and attn_batches % 4 == 0  # np * b must be a multiple of 4
+        )
+        if not eligible:
+            return False
+        # The extension reports how many batches it handles per block; the
+        # relevant dimension must divide evenly by it.
+        batch_per_block = self.get_batch_per_block(sq, sk, b, np)
+        if self.attn_mask_type == AttnMaskType.causal:
+            return attn_batches % batch_per_block == 0
+        return sq % batch_per_block == 0
+    def forward_fused_softmax(self, input, mask):
+        """Run the fused kernel matching the mask type and mask presence."""
+        b, np, sq, sk = input.size()
+        scale = self.scale if self.scale is not None else 1.0
+        if self.attn_mask_type == AttnMaskType.causal:
+            assert sq == sk, "causal mask is only for self attention"
+            # input is 3D tensor (attn_batches, sq, sk)
+            input = input.view(-1, sq, sk)
+            probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
+            return probs.view(b, np, sq, sk)
+        # input is 4D tensor (b, np, sq, sk)
+        if mask is not None:
+            return ScaledMaskedSoftmax.apply(input, mask, scale)
+        return ScaledSoftmax.apply(input, scale)
+    def forward_torch_softmax(self, input, mask):
+        """Unfused fallback: optional fp32 upcast, scale, mask, softmax."""
+        upcast = self.input_in_float16 and self.softmax_in_fp32
+        if upcast:
+            input = input.float()
+        if self.scale is not None:
+            input = input * self.scale
+        mask_output = self.mask_func(input, mask) if mask is not None else input
+        probs = torch.nn.Softmax(dim=-1)(mask_output)
+        if upcast:
+            # Cast back to the half-precision format the input arrived in.
+            if self.input_in_fp16:
+                probs = probs.half()
+            else:
+                probs = probs.bfloat16()
+        return probs
+    @staticmethod
+    def get_batch_per_block(sq, sk, b, np):
+        """Ask the ``scaled_masked_softmax_cuda`` extension for its batch-per-block value."""
+        import scaled_masked_softmax_cuda
+        return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
--- a/3rdparty/Megatron-LM/megatron/model/gpt_model.py
+++ b/3rdparty/Megatron-LM/megatron/model/gpt_model.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPT-2 model."""
+import torch
+from megatron import get_args
+from megatron import mpu
+from .module import MegatronModule
+from .enums import AttnMaskType
+from .language_model import parallel_lm_logits
+from .language_model import get_language_model
+from .utils import init_method_normal
+from .utils import scaled_init_method_normal
+def post_language_model_processing(lm_output, labels, logit_weights,
+                                   parallel_output,
+                                   fp16_lm_cross_entropy):
+    """Project ``lm_output`` to logits and, if ``labels`` is given, to a loss.
+    Returns the logits when ``labels`` is None; otherwise returns the result of
+    ``mpu.vocab_parallel_cross_entropy``, computed on the logits as-is when
+    ``fp16_lm_cross_entropy`` is set and on a float copy otherwise.
+    """
+    logits = parallel_lm_logits(lm_output, logit_weights, parallel_output)
+    if labels is None:
+        return logits
+    if not fp16_lm_cross_entropy:
+        return mpu.vocab_parallel_cross_entropy(logits.float(), labels)
+    # The half-precision loss path requires half-precision logits.
+    assert logits.dtype == torch.half
+    return mpu.vocab_parallel_cross_entropy(logits, labels)
+class GPTModel(MegatronModule):
+    """GPT-2 Language model.
+    Wraps the module returned by ``get_language_model`` (built with a causal
+    encoder attention mask and no pooler). When ``post_process`` is set, the
+    forward pass converts the language-model output into logits or a loss via
+    ``post_language_model_processing``.
+    """
+    def __init__(self,
+                 num_tokentypes=0,
+                 parallel_output=True,
+                 pre_process=True,
+                 post_process=True):
+        super(GPTModel, self).__init__()
+        args = get_args()
+        self.parallel_output = parallel_output
+        self.pre_process = pre_process
+        self.post_process = post_process
+        self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
+        # get_language_model returns the module and the checkpoint key under
+        # which its state is stored.
+        self.language_model, self._language_model_key = get_language_model(
+            num_tokentypes=num_tokentypes,
+            add_pooler=False,
+            encoder_attn_mask_type=AttnMaskType.causal,
+            init_method=init_method_normal(args.init_method_std),
+            scaled_init_method=scaled_init_method_normal(args.init_method_std,
+                                                         args.num_layers),
+            pre_process=self.pre_process,
+            post_process=self.post_process)
+        # NOTE(review): the `init_method_normal` function itself (not the
+        # result of calling it) is passed here. `initialize_word_embeddings`
+        # is not defined in this class -- presumably inherited from
+        # MegatronModule; confirm what it expects.
+        self.initialize_word_embeddings(init_method_normal)
+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        self.language_model.set_input_tensor(input_tensor)
+    def forward(self, input_ids, position_ids, attention_mask, labels=None,
+                tokentype_ids=None, inference_params=None):
+        """Run the language model and, on a post-process stage, the LM head.
+        Returns the raw language-model output when ``post_process`` is False;
+        otherwise returns whatever ``post_language_model_processing`` returns
+        (logits when ``labels`` is None, a loss otherwise).
+        """
+        # NOTE(review): `tokentype_ids` is accepted but is not forwarded to
+        # the language model in this method.
+        lm_output = self.language_model(
+            input_ids,
+            position_ids,
+            attention_mask,
+            inference_params=inference_params)
+        if self.post_process:
+            return post_language_model_processing(
+                lm_output, labels,
+                self.word_embeddings_weight(),
+                self.parallel_output,
+                self.fp16_lm_cross_entropy)
+        else:
+            return lm_output
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """Build the checkpoint dict: language-model state under its key,
+        plus the head's word embeddings on a post-process-only stage."""
+        state_dict_ = {}
+        state_dict_[self._language_model_key] \
+            = self.language_model.state_dict_for_save_checkpoint(
+                destination, prefix, keep_vars)
+        # Save word_embeddings.
+        # Only a stage that post-processes without pre-processing stores the
+        # separate `self.word_embeddings` state here.
+        if self.post_process and not self.pre_process:
+            state_dict_[self._word_embeddings_for_head_key] \
+                = self.word_embeddings.state_dict(destination, prefix, keep_vars)
+        return state_dict_
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load."""
+        # Load word_embeddings.
+        if self.post_process and not self.pre_process:
+            self.word_embeddings.load_state_dict(
+                state_dict[self._word_embeddings_for_head_key], strict=strict)
+        # Accept either a dict wrapped under the language-model key or the
+        # language-model state dict passed directly.
+        if self._language_model_key in state_dict:
+            state_dict = state_dict[self._language_model_key]
+        self.language_model.load_state_dict(state_dict, strict=strict)
--- a/3rdparty/Megatron-LM/megatron/model/language_model.py
+++ b/3rdparty/Megatron-LM/megatron/model/language_model.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformer based language model."""
+import torch
+import torch.nn.functional as F
+from megatron import get_args
+from megatron import mpu
+from .module import MegatronModule
+from megatron.model.enums import LayerType, AttnMaskType
+from megatron.model.transformer import ParallelTransformer
+from megatron.model.utils import get_linear_layer
+from megatron.model.utils import init_method_normal, scaled_init_method_normal
+def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
+                       bias=None):
+    """LM logits using word embedding weights.
+    Returns the tensor-parallel logits as-is when ``parallel_output`` is true,
+    otherwise the result of gathering them across the tensor-model-parallel
+    region.
+    """
+    parallel_input = mpu.copy_to_tensor_model_parallel_region(input_)
+    # F.linear treats bias=None as "no bias", so one call covers both cases.
+    parallel_logits = F.linear(parallel_input, word_embeddings_weight, bias)
+    if not parallel_output:
+        return mpu.gather_from_tensor_model_parallel_region(parallel_logits)
+    return parallel_logits
+def get_language_model(num_tokentypes, add_pooler,
+                       encoder_attn_mask_type, init_method=None,
+                       scaled_init_method=None, add_encoder=True,
+                       add_decoder=False,
+                       decoder_attn_mask_type=AttnMaskType.causal,
+                       pre_process=True, post_process=True):
+    """Build language model and return along with the key to save.
+    Missing init methods are derived from ``args.init_method_std`` (and
+    ``args.num_layers`` for the scaled variant).
+    """
+    args = get_args()
+    if init_method is None:
+        init_method = init_method_normal(args.init_method_std)
+    if scaled_init_method is None:
+        scaled_init_method = scaled_init_method_normal(
+            args.init_method_std, args.num_layers)
+    model_kwargs = dict(
+        num_tokentypes=num_tokentypes,
+        add_encoder=add_encoder,
+        add_decoder=add_decoder,
+        decoder_attn_mask_type=decoder_attn_mask_type,
+        add_pooler=add_pooler,
+        pre_process=pre_process,
+        post_process=post_process,
+    )
+    model = TransformerLanguageModel(
+        init_method, scaled_init_method, encoder_attn_mask_type,
+        **model_kwargs)
+    # Second element is the key used for checkpoints.
+    return model, 'language_model'
+class Pooler(MegatronModule):
+    """Pooler layer.
+    Selects the hidden state of one token (for example the start of the
+    sequence), applies a linear transformation and then a tanh.
+    Arguments:
+        hidden_size: hidden size
+        init_method: weight initialization method for the linear layer.
+            bias is set to zero.
+    """
+    def __init__(self, hidden_size, init_method):
+        super().__init__()
+        self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
+    def forward(self, hidden_states, sequence_index=0):
+        # hidden_states: [b, s, h]; sequence_index picks the token to pool.
+        token_states = hidden_states[:, sequence_index, :]
+        return torch.tanh(self.dense(token_states))
+class Embedding(MegatronModule):
+    """Language model embeddings.
+    Sums word and position embeddings (plus token-type embeddings when
+    configured) and applies dropout.
+    Arguments:
+        hidden_size: hidden size
+        vocab_size: vocabulary size
+        max_sequence_length: maximum size of sequence. This
+                             is used for positional embedding
+        embedding_dropout_prob: dropout probability for embeddings
+        init_method: weight initialization method
+        num_tokentypes: size of the token-type embeddings. 0 value
+                        will ignore this embedding
+    """
+    def __init__(self,
+                 hidden_size,
+                 vocab_size,
+                 max_sequence_length,
+                 embedding_dropout_prob,
+                 init_method,
+                 num_tokentypes=0):
+        super(Embedding, self).__init__()
+        self.hidden_size = hidden_size
+        self.init_method = init_method
+        self.num_tokentypes = num_tokentypes
+        # NOTE(review): `args` is not used again in this method.
+        args = get_args()
+        # Word embeddings (parallel).
+        self.word_embeddings = mpu.VocabParallelEmbedding(
+            vocab_size, self.hidden_size,
+            init_method=self.init_method)
+        self._word_embeddings_key = 'word_embeddings'
+        # Position embedding (serial).
+        self.position_embeddings = torch.nn.Embedding(
+            max_sequence_length, self.hidden_size)
+        self._position_embeddings_key = 'position_embeddings'
+        # Initialize the position embeddings.
+        self.init_method(self.position_embeddings.weight)
+        # Token type embedding.
+        # Add this as an optional field that can be added through
+        # method call so we can load a pretrain model without
+        # token types and add them as needed.
+        self._tokentype_embeddings_key = 'tokentype_embeddings'
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes,
+                                                           self.hidden_size)
+            # Initialize the token-type embeddings.
+            self.init_method(self.tokentype_embeddings.weight)
+        else:
+            self.tokentype_embeddings = None
+        # Embeddings dropout
+        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+    def zero_parameters(self):
+        """Zero out all parameters in embedding."""
+        # Each weight is filled with zeros and tagged with `shared = True`.
+        self.word_embeddings.weight.data.fill_(0)
+        self.word_embeddings.weight.shared = True
+        self.position_embeddings.weight.data.fill_(0)
+        self.position_embeddings.weight.shared = True
+        if self.num_tokentypes > 0:
+            self.tokentype_embeddings.weight.data.fill_(0)
+            self.tokentype_embeddings.weight.shared = True
+    def add_tokentype_embeddings(self, num_tokentypes):
+        """Add token-type embedding. This function is provided so we can add
+        token-type embeddings in case the pretrained model does not have it.
+        This allows us to load the model normally and then add this embedding.
+        Raises an Exception if token-type embeddings already exist.
+        """
+        if self.tokentype_embeddings is not None:
+            raise Exception('tokentype embeddings is already initialized')
+        # Only rank 0 prints the notice.
+        if torch.distributed.get_rank() == 0:
+            print('adding embedding for {} tokentypes'.format(num_tokentypes),
+                  flush=True)
+        self.num_tokentypes = num_tokentypes
+        self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
+                                                       self.hidden_size)
+        # Initialize the token-type embeddings.
+        # NOTE(review): `args` is not used again in this method.
+        args = get_args()
+        self.init_method(self.tokentype_embeddings.weight)
+    def forward(self, input_ids, position_ids, tokentype_ids=None):
+        """Return dropout(word + position [+ token-type] embeddings).
+        ``tokentype_ids`` must be given exactly when token-type embeddings
+        exist; both mismatches trip an assertion.
+        """
+        # Embeddings.
+        words_embeddings = self.word_embeddings(input_ids)
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings = words_embeddings + position_embeddings
+        if tokentype_ids is not None:
+            assert self.tokentype_embeddings is not None
+            embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
+        else:
+            assert self.tokentype_embeddings is None
+        # Dropout.
+        embeddings = self.embedding_dropout(embeddings)
+        return embeddings
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """For easy load."""
+        # One nested state dict per embedding table, keyed by the
+        # `_*_embeddings_key` names set in __init__.
+        state_dict_ = {}
+        state_dict_[self._word_embeddings_key] \
+            = self.word_embeddings.state_dict(destination, prefix, keep_vars)
+        state_dict_[self._position_embeddings_key] \
+            = self.position_embeddings.state_dict(
+                destination, prefix, keep_vars)
+        if self.num_tokentypes > 0:
+            state_dict_[self._tokentype_embeddings_key] \
+                = self.tokentype_embeddings.state_dict(
+                    destination, prefix, keep_vars)
+        return state_dict_
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load."""
+        # Each table is read from its nested key when present; otherwise the
+        # flat keys containing the table name are collected, keeping the part
+        # of each key after '<table name>.'.
+        # Word embedding.
+        if self._word_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._word_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'word_embeddings' in key:
+                    state_dict_[key.split('word_embeddings.')[1]] \
+                        = state_dict[key]
+        self.word_embeddings.load_state_dict(state_dict_, strict=strict)
+        # Position embedding.
+        if self._position_embeddings_key in state_dict:
+            state_dict_ = state_dict[self._position_embeddings_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'position_embeddings' in key:
+                    state_dict_[key.split('position_embeddings.')[1]] \
+                        = state_dict[key]
+        self.position_embeddings.load_state_dict(state_dict_, strict=strict)
+        # Tokentype embedding.
+        if self.num_tokentypes > 0:
+            state_dict_ = {}
+            if self._tokentype_embeddings_key in state_dict:
+                state_dict_ = state_dict[self._tokentype_embeddings_key]
+            else:
+                # for backward compatibility.
+                for key in state_dict.keys():
+                    if 'tokentype_embeddings' in key:
+                        state_dict_[key.split('tokentype_embeddings.')[1]] \
+                            = state_dict[key]
+            # A checkpoint without token-type weights only produces a
+            # warning; the embeddings keep their current values.
+            if len(state_dict_.keys()) > 0:
+                self.tokentype_embeddings.load_state_dict(state_dict_,
+                                                          strict=strict)
+            else:
+                print('***WARNING*** expected tokentype embeddings in the '
+                      'checkpoint but could not find it', flush=True)
+class TransformerLanguageModel(MegatronModule):
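+    """Transformer language model.
+    Arguments:
+        init_method: weight initialization method, passed to the embedding,
+                     the transformer stacks and the pooler
+        output_layer_init_method: second initialization method passed to
+                                  the transformer stacks
+        encoder_attn_mask_type: self-attention mask type for the encoder
+        num_tokentypes: size of the token-type embeddings. 0 value
+                        will ignore this embedding
+        add_encoder: whether to build the encoder stack
+        add_decoder: whether to build the decoder stack
+        decoder_attn_mask_type: self-attention mask type for the decoder
+        add_pooler: whether to build a pooler (only on a post_process stage)
+        pre_process: whether this stage builds the embedding
+        post_process: whether this stage is a post-processing stage
+    """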
+    def __init__(self,
+                 init_method,
+                 output_layer_init_method,
+                 encoder_attn_mask_type,
+                 num_tokentypes=0,
+                 add_encoder=True,
+                 add_decoder=False,
+                 decoder_attn_mask_type=AttnMaskType.causal,
+                 add_pooler=False,
+                 pre_process=True,
+                 post_process=True):
+        """Build the embedding, encoder, decoder and pooler this stage needs.
+        Sizes and dropout come from the global ``get_args()`` namespace
+        (``hidden_size``, ``padded_vocab_size``, ``max_position_embeddings``,
+        ``hidden_dropout``).
+        """
+        super(TransformerLanguageModel, self).__init__()
+        args = get_args()
+        self.pre_process = pre_process
+        self.post_process = post_process
+        self.hidden_size = args.hidden_size
+        self.num_tokentypes = num_tokentypes
+        self.init_method = init_method
+        self.add_encoder = add_encoder
+        self.encoder_attn_mask_type = encoder_attn_mask_type
+        self.add_decoder = add_decoder
+        self.decoder_attn_mask_type = decoder_attn_mask_type
+        self.add_pooler = add_pooler
+        # Filled in by set_input_tensor on a decoder-only stage.
+        self.encoder_hidden_state = None
+        # Embeddings.
+        # Only built when pre_process is set; `self.embedding` does not exist
+        # otherwise.
+        if self.pre_process:
+            self.embedding = Embedding(self.hidden_size,
+                                       args.padded_vocab_size,
+                                       args.max_position_embeddings,
+                                       args.hidden_dropout,
+                                       self.init_method,
+                                       self.num_tokentypes)
+            self._embedding_key = 'embedding'
+        # Transformer.
+        # Encoder (usually set to True, False if part of an encoder-decoder
+        # architecture and in encoder-only stage).
+        if self.add_encoder:
+            self.encoder = ParallelTransformer(
+                self.init_method,
+                output_layer_init_method,
+                self_attn_mask_type=self.encoder_attn_mask_type,
+                pre_process=self.pre_process,
+                post_process=self.post_process
+            )
+            self._encoder_key = 'encoder'
+        else:
+            self.encoder = None
+        # Decoder (usually set to False, True if part of an encoder-decoder
+        # architecture and in decoder-only stage).
+        if self.add_decoder:
+            self.decoder = ParallelTransformer(
+                self.init_method,
+                output_layer_init_method,
+                layer_type=LayerType.decoder,
+                self_attn_mask_type=self.decoder_attn_mask_type,
+                pre_process=self.pre_process,
+                post_process=self.post_process)
+            self._decoder_key = 'decoder'
+        else:
+            self.decoder = None
+        if self.post_process:
+            # Pooler.
+            # `self.pooler` only exists when both post_process and add_pooler
+            # are set.
+            if self.add_pooler:
+                self.pooler = Pooler(self.hidden_size, self.init_method)
+                self._pooler_key = 'pooler'
+    def set_input_tensor(self, input_tensor):
+        """ See megatron.model.transformer.set_input_tensor()"""
+        # This is usually handled in schedules.py but some inference code still
+        # gives us non-lists or None, so normalise to a list first.
+        tensors = input_tensor if isinstance(input_tensor, list) \
+            else [input_tensor]
+        if not (self.add_encoder or self.add_decoder):
+            raise Exception('Stage must have at least either encoder or decoder')
+        if self.add_encoder:
+            # With an encoder present, the single tensor always feeds it.
+            if self.add_decoder:
+                assert len(tensors) == 1, \
+                    'input_tensor should only be length 1 for stage with both encoder and decoder'
+            else:
+                assert len(tensors) == 1, \
+                    'input_tensor should only be length 1 for stage with only encoder'
+            self.encoder.set_input_tensor(tensors[0])
+            return
+        # Decoder-only stage: the last tensor is the encoder hidden state; a
+        # preceding tensor, if any, is the decoder input.
+        count = len(tensors)
+        if count == 2:
+            self.decoder.set_input_tensor(tensors[0])
+            self.encoder_hidden_state = tensors[1]
+        elif count == 1:
+            self.decoder.set_input_tensor(None)
+            self.encoder_hidden_state = tensors[0]
+        else:
+            raise Exception('input_tensor must have either length 1 or 2')
+    def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
+                dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
+                enc_dec_attn_mask=None, tokentype_ids=None,
+                inference_params=None,
+                pooling_sequence_index=0,
+                enc_hidden_states=None, output_enc_hidden=False):
+        """Run the embedding, encoder and (optionally) decoder of this stage.
+        Returns ``encoder_output`` (plus ``pooled_output`` when a pooler is
+        active) if there is no decoder or ``output_enc_hidden`` is set;
+        otherwise ``(decoder_output, encoder_output)`` with ``pooled_output``
+        appended when a pooler is active.
+        """
+        # Encoder embedding.
+        if self.pre_process:
+            encoder_input = self.embedding(enc_input_ids, enc_position_ids,
+                                           tokentype_ids=tokentype_ids)
+        else:
+            encoder_input = None
+        # Run encoder.
+        if enc_hidden_states is None:
+            if self.encoder is not None:
+                encoder_output = self.encoder(
+                    encoder_input,
+                    enc_attn_mask,
+                    inference_params=inference_params)
+            else:
+                # No encoder on this stage: use the state stored by
+                # set_input_tensor.
+                encoder_output = self.encoder_hidden_state
+        elif encoder_input is not None:
+            # Caller-supplied encoder states are cast to the embedding dtype.
+            encoder_output = enc_hidden_states.to(encoder_input.dtype)
+        else:
+            # Without an embedding (pre_process is False) there is no
+            # reference dtype, so the supplied states are used unchanged
+            # instead of dereferencing None.
+            encoder_output = enc_hidden_states
+        pooler_active = self.add_pooler and self.post_process
+        if pooler_active:
+            pooled_output = self.pooler(encoder_output,
+                                        pooling_sequence_index)
+        # output_enc_hidden refers to when we just need the encoder's
+        # output. For example, it is helpful to compute
+        # similarity between two sequences by average pooling
+        if not self.add_decoder or output_enc_hidden:
+            if pooler_active:
+                return encoder_output, pooled_output
+            return encoder_output
+        # Decoder embedding.
+        if self.pre_process:
+            decoder_input = self.embedding(dec_input_ids,
+                                           dec_position_ids)
+        else:
+            decoder_input = None
+        # Run decoder.
+        decoder_output = self.decoder(
+            decoder_input,
+            dec_attn_mask,
+            encoder_output=encoder_output,
+            enc_dec_attn_mask=enc_dec_attn_mask,
+            inference_params=inference_params)
+        if pooler_active:
+            return decoder_output, encoder_output, pooled_output
+        return decoder_output, encoder_output
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """Collect the checkpoint state of every sub-module on this stage.
+        Each sub-module is stored under its own key (embedding, encoder,
+        pooler, decoder -- in that order) so that it can be found again by
+        load_state_dict(). A sub-module is only included when the flag that
+        guards it (pre_process, add_encoder, post_process and add_pooler,
+        add_decoder) is set.
+        """
+        # The same three arguments are forwarded unchanged to every child.
+        save_args = (destination, prefix, keep_vars)
+        checkpoint = {}
+        if self.pre_process:
+            checkpoint[self._embedding_key] = \
+                self.embedding.state_dict_for_save_checkpoint(*save_args)
+        if self.add_encoder:
+            checkpoint[self._encoder_key] = \
+                self.encoder.state_dict_for_save_checkpoint(*save_args)
+        if self.post_process and self.add_pooler:
+            checkpoint[self._pooler_key] = \
+                self.pooler.state_dict_for_save_checkpoint(*save_args)
+        if self.add_decoder:
+            checkpoint[self._decoder_key] = \
+                self.decoder.state_dict_for_save_checkpoint(*save_args)
+        return checkpoint
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load.
+        Restores every sub-module owned by this stage from `state_dict`,
+        which is expected to be laid out as produced by
+        state_dict_for_save_checkpoint(). Older flat layouts are still
+        accepted for the embedding and the encoder.
+        Arguments:
+            state_dict: mapping holding the per-sub-module state dicts.
+            strict: forwarded to each sub-module's load_state_dict().
+        Raises an AssertionError when the pooler or decoder entry is
+        required but missing.
+        """
+        # Embedding.
+        if self.pre_process:
+            if self._embedding_key in state_dict:
+                state_dict_ = state_dict[self._embedding_key]
+            else:
+                # for backward compatibility: pick the embedding tensors out
+                # of a flat checkpoint.
+                state_dict_ = {key: value
+                               for key, value in state_dict.items()
+                               if '_embeddings' in key}
+            self.embedding.load_state_dict(state_dict_, strict=strict)
+        # Encoder.
+        if self.add_encoder:
+            if self._encoder_key in state_dict:
+                state_dict_ = state_dict[self._encoder_key]
+            # For backward compatibility.
+            elif 'transformer' in state_dict:
+                state_dict_ = state_dict['transformer']
+            else:
+                # For backward compatibility: flat checkpoint whose encoder
+                # entries carry a 'transformer.' prefix.
+                state_dict_ = {key.split('transformer.')[1]: value
+                               for key, value in state_dict.items()
+                               if 'transformer.' in key}
+            # For backward compatibility: '.attention.' was renamed to
+            # '.self_attention.'; keys without it pass through unchanged.
+            state_dict_ = {
+                key.replace('.attention.', '.self_attention.'): value
+                for key, value in state_dict_.items()}
+            self.encoder.load_state_dict(state_dict_, strict=strict)
+        # Pooler.
+        if self.post_process and self.add_pooler:
+            # Check the very key that is indexed on the next line.
+            assert self._pooler_key in state_dict, \
+                'could not find data for pooler in the checkpoint'
+            self.pooler.load_state_dict(state_dict[self._pooler_key],
+                                        strict=strict)
+        # Decoder.
+        if self.add_decoder:
+            assert self._decoder_key in state_dict, \
+                'could not find data for decoder in the checkpoint'
+            self.decoder.load_state_dict(state_dict[self._decoder_key],
+                                         strict=strict)
--- a/3rdparty/Megatron-LM/megatron/model/module.py
+++ b/3rdparty/Megatron-LM/megatron/model/module.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Megatron Module"""
+import torch
+from torch.autograd import Variable
+from torch.nn.parameter import Parameter
+from megatron import get_args
+from megatron import mpu
+# Typed tensor classes (CPU and CUDA variants) grouped by precision; used by
+# the isinstance() checks in fp32_to_float16() and float16_to_fp32().
+_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
+_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
+_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)
+def param_is_not_shared(param):
+    """Return True unless `param` has a truthy `shared` attribute."""
+    return not getattr(param, 'shared', False)
+class MegatronModule(torch.nn.Module):
+    """Megatron specific extensions of torch Module with support
+    for pipelining.
+    The word-embedding helpers below read `self.pre_process` and
+    `self.language_model`, which are not set here; subclasses that use them
+    are expected to provide both attributes.
+    """
+    def __init__(self, share_word_embeddings=True):
+        """Store whether the word embeddings are shared with the output
+        head (see initialize_word_embeddings())."""
+        super(MegatronModule, self).__init__()
+        self.share_word_embeddings = share_word_embeddings
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """Use this function to override the state dict for
+        saving checkpoints."""
+        return self.state_dict(destination, prefix, keep_vars)
+    def word_embeddings_weight(self):
+        """Return the word-embedding weight held by this stage.
+        With pre_process set this is the language model's embedding weight;
+        otherwise it is the weight of the copy created by
+        initialize_word_embeddings(). Raises an Exception in the latter case
+        when share_word_embeddings is false.
+        """
+        if self.pre_process:
+            return self.language_model.embedding.word_embeddings.weight
+        else:
+            if not self.share_word_embeddings:
+                raise Exception('word_embeddings_weight() called for last '
+                                'stage, but share_word_embeddings is false')
+            return self.word_embeddings.weight
+    def initialize_word_embeddings(self, init_method_normal):
+        """Set up and synchronize the shared word embeddings across stages.
+        Arguments:
+            init_method_normal: callable that is given args.init_method_std
+                and returns the init method for the extra embedding copy.
+        Raises an Exception when share_word_embeddings is false. Returns
+        early, without doing anything, when pipeline parallelism is not in
+        use.
+        """
+        args = get_args()
+        if not self.share_word_embeddings:
+            raise Exception('initialize_word_embeddings() was called but '
+                            'share_word_embeddings is false')
+        # This function just initializes the word embeddings in the final stage
+        # when we are using pipeline parallelism. Nothing to do if we aren't
+        # using pipeline parallelism.
+        if args.pipeline_model_parallel_size == 1:
+            return
+        # Parameters are shared between the word embeddings layers, and the
+        # heads at the end of the model. In a pipelined setup with more than
+        # one stage, the initial embedding layer and the head are on different
+        # workers, so we do the following:
+        # 1. Create a second copy of word_embeddings on the last stage, with
+        #    initial parameters of 0.0.
+        # 2. Do an all-reduce between the first and last stage to ensure that
+        #    the two copies of word_embeddings start off with the same
+        #    parameter values.
+        # 3. In the training loop, perform an all-reduce between the grads of
+        #    the two word_embeddings layers to ensure that every applied weight
+        #    update is the same on both stages.
+        if mpu.is_pipeline_last_stage() and \
+                not self.pre_process:
+            assert not mpu.is_pipeline_first_stage()
+            self._word_embeddings_for_head_key = 'word_embeddings_for_head'
+            # set word_embeddings weights to 0 here, then copy first
+            # stage's weights using all_reduce below.
+            self.word_embeddings = mpu.VocabParallelEmbedding(
+                args.padded_vocab_size, args.hidden_size,
+                init_method=init_method_normal(args.init_method_std))
+            self.word_embeddings.weight.data.fill_(0)
+            # Tag the copy as shared (see param_is_not_shared()).
+            self.word_embeddings.weight.shared = True
+        # Zero out initial weights for decoder embedding.
+        # NOTE: We don't currently support T5 with the interleaved schedule.
+        if not mpu.is_pipeline_first_stage(ignore_virtual=True) and \
+                self.pre_process:
+            self.language_model.embedding.zero_parameters()
+        # Ensure that first and last stages have the same initial parameter
+        # values.
+        if torch.distributed.is_initialized():
+            if mpu.is_rank_in_embedding_group():
+                torch.distributed.all_reduce(self.word_embeddings_weight().data,
+                                             group=mpu.get_embedding_group())
+            # Ensure that encoder (first stage) and decoder (split stage)
+            # position embeddings have the same initial parameter values.
+            # NOTE: We don't currently support T5 with the interleaved schedule.
+            if mpu.is_rank_in_position_embedding_group() and \
+                    args.pipeline_model_parallel_split_rank is not None:
+                # TODO: Support tokentype embedding.
+                self.language_model.embedding.cuda()
+                position_embeddings = self.language_model.embedding.position_embeddings
+                torch.distributed.all_reduce(position_embeddings.weight.data,
+                                             group=mpu.get_position_embedding_group())
+        else:
+            print("WARNING! Distributed processes aren't initialized, so "
+                  "word embeddings in the last layer are not initialized. "
+                  "If you are just manipulating a model this is fine, but "
+                  "this needs to be handled manually. If you are training "
+                  "something is definitely wrong.")
+def conversion_helper(val, conversion):
+    """Apply `conversion` to `val`, recursing through nested tuples/lists.
+    Anything that is not a tuple or list is treated as a leaf and passed to
+    `conversion`. Containers are rebuilt with the same type: a tuple stays a
+    tuple, a list stays a list.
+    """
+    if isinstance(val, (tuple, list)):
+        converted = [conversion_helper(item, conversion) for item in val]
+        return tuple(converted) if isinstance(val, tuple) else converted
+    return conversion(val)
+def fp32_to_float16(val, float16_convertor):
+    """Convert fp32 `val` to fp16/bf16
+    Arguments:
+        val: a value or a nested tuple/list of values.
+        float16_convertor: callable applied to every leaf whose tensor type
+            is one of _FLOAT_TYPES; all other leaves are returned untouched.
+    """
+    def half_conversion(leaf):
+        # Parameters/Variables are type-checked through their .data tensor,
+        # but the convertor is applied to the leaf object itself.
+        probe = leaf.data if isinstance(leaf, (Parameter, Variable)) else leaf
+        if not isinstance(probe, _FLOAT_TYPES):
+            return leaf
+        return float16_convertor(leaf)
+    return conversion_helper(val, half_conversion)
+def float16_to_fp32(val):
+    """Convert fp16/bf16 `val` to fp32
+    `val` may be a single value or a nested tuple/list of values. Leaves
+    whose tensor type is one of _BF16_TYPES or _HALF_TYPES are replaced by
+    the result of .float(); everything else is returned untouched.
+    """
+    def float_conversion(leaf):
+        # Parameters/Variables are type-checked through their .data tensor,
+        # but .float() is called on the leaf object itself.
+        probe = leaf.data if isinstance(leaf, (Parameter, Variable)) else leaf
+        if not isinstance(probe, (_BF16_TYPES, _HALF_TYPES)):
+            return leaf
+        return leaf.float()
+    return conversion_helper(val, float_conversion)
+class Float16Module(MegatronModule):
+    """Wrapper that runs a module in fp16 or bf16.
+    The wrapped module is converted once at construction time. At call time
+    the inputs are cast down on the first pipeline stage and the outputs are
+    cast back to fp32 on the last pipeline stage. State-dict methods are
+    delegated to the wrapped module.
+    """
+    def __init__(self, module, args):
+        """Convert `module` according to args.fp16 / args.bf16 (fp16 is
+        checked first); raise an Exception when neither is set."""
+        super().__init__()
+        if args.fp16:
+            def float16_convertor(val):
+                return val.half()
+            converted = module.half()
+        elif args.bf16:
+            def float16_convertor(val):
+                return val.bfloat16()
+            converted = module.bfloat16()
+        else:
+            raise Exception('should not be here')
+        self.add_module('module', converted)
+        self.float16_convertor = float16_convertor
+    def set_input_tensor(self, input_tensor):
+        """Delegate to the wrapped module."""
+        return self.module.set_input_tensor(input_tensor)
+    def forward(self, *inputs, **kwargs):
+        """Call the wrapped module, casting at the pipeline boundaries.
+        Only positional inputs are cast down; keyword arguments are passed
+        through as given.
+        """
+        on_first_stage = mpu.is_pipeline_first_stage()
+        if on_first_stage:
+            inputs = fp32_to_float16(inputs, self.float16_convertor)
+        result = self.module(*inputs, **kwargs)
+        if not mpu.is_pipeline_last_stage():
+            return result
+        return float16_to_fp32(result)
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        """Return the wrapped module's state dict."""
+        return self.module.state_dict(destination, prefix, keep_vars)
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """Return the wrapped module's checkpoint state dict."""
+        return self.module.state_dict_for_save_checkpoint(
+            destination, prefix, keep_vars)
+    def load_state_dict(self, state_dict, strict=True):
+        """Load `state_dict` into the wrapped module."""
+        self.module.load_state_dict(state_dict, strict=strict)
--- a/3rdparty/Megatron-LM/megatron/model/multiple_choice.py
+++ b/3rdparty/Megatron-LM/megatron/model/multiple_choice.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Multiple choice model."""
+import torch
+from megatron import get_args, print_rank_last
+from megatron import mpu
+from megatron.model.enums import AttnMaskType
+from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
+from megatron.model.language_model import get_language_model
+from megatron.model.utils import get_linear_layer
+from megatron.model.utils import init_method_normal
+from megatron.model.utils import scaled_init_method_normal
+from .module import MegatronModule
+class MultipleChoice(MegatronModule):
+    """Language model with a pooled, single-logit-per-choice head.
+    Arguments:
+        num_tokentypes: forwarded to get_language_model().
+        pre_process: stored on the instance and forwarded to
+            get_language_model().
+        post_process: same as pre_process; additionally, the dropout and
+            the linear multi-choice head are only created when it is set.
+    """
+    def __init__(self,
+                 num_tokentypes=2,
+                 pre_process=True,
+                 post_process=True):
+        super(MultipleChoice, self).__init__(share_word_embeddings=False)
+        args = get_args()
+        init_method = init_method_normal(args.init_method_std)
+        self.pre_process = pre_process
+        self.post_process = post_process
+        self.language_model, self._language_model_key = get_language_model(
+            num_tokentypes=num_tokentypes,
+            add_pooler=True,
+            encoder_attn_mask_type=AttnMaskType.padding,
+            init_method=init_method,
+            scaled_init_method=scaled_init_method_normal(args.init_method_std,
+                                                         args.num_layers),
+            pre_process=self.pre_process,
+            post_process=self.post_process)
+        # Multi-choice head.
+        if self.post_process:
+            self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
+            self.multichoice_head = get_linear_layer(args.hidden_size, 1,
+                                                     init_method)
+            self._multichoice_head_key = 'multichoice_head'
+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        self.language_model.set_input_tensor(input_tensor)
+    def forward(self, model_input, attention_mask, tokentype_ids=None):
+        """Score every choice of every sample.
+        Arguments:
+            model_input: input ids, 3-dimensional
+                [batch, choices, sequence].
+            attention_mask: 3-dimensional, same layout as model_input.
+            tokentype_ids: optional, 3-dimensional when given. None is
+                passed through to the language model unchanged.
+        Returns:
+            With post_process set, the logits reshaped to
+            [-1, num_choices]; otherwise the raw language model output.
+        """
+        # [batch, choices, sequence] --> [batch * choices, sequence] -->
+        #    transformer --> [batch, choices] --> softmax
+        # Ensure the shape is [batch-size, choices, sequence]
+        assert len(attention_mask.shape) == 3
+        num_choices = attention_mask.shape[1]
+        # Reshape and treat choice dimension the same as batch.
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1))
+        extended_attention_mask = bert_extended_attention_mask(attention_mask)
+        input_ids = model_input
+        # Do the same as attention_mask for input_ids, tokentype_ids
+        assert len(input_ids.shape) == 3
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        # tokentype_ids defaults to None, so only validate and flatten it
+        # when the caller actually supplied it.
+        if tokentype_ids is not None:
+            assert len(tokentype_ids.shape) == 3
+            tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))
+        position_ids = bert_position_ids(input_ids)
+        lm_output = self.language_model(
+            input_ids,
+            position_ids,
+            extended_attention_mask,
+            tokentype_ids=tokentype_ids
+        )
+        if not self.post_process:
+            return lm_output
+        _, pooled_output = lm_output
+        multichoice_output = self.multichoice_dropout(pooled_output)
+        multichoice_logits = self.multichoice_head(multichoice_output)
+        # Reshape back to separate choices.
+        return multichoice_logits.view(-1, num_choices)
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """For easy load when model is combined with other heads,
+        add an extra key."""
+        state_dict_ = {}
+        state_dict_[self._language_model_key] \
+            = self.language_model.state_dict_for_save_checkpoint(
+                destination, prefix, keep_vars)
+        if self.post_process:
+            state_dict_[self._multichoice_head_key] \
+                = self.multichoice_head.state_dict(
+                    destination, prefix, keep_vars)
+        return state_dict_
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load.
+        The language model entry is required. A missing multi-choice head
+        entry only produces a warning and leaves the head at its initial
+        values.
+        """
+        self.language_model.load_state_dict(
+            state_dict[self._language_model_key], strict=strict)
+        if self.post_process:
+            if self._multichoice_head_key in state_dict:
+                self.multichoice_head.load_state_dict(
+                    state_dict[self._multichoice_head_key], strict=strict)
+            else:
+                print_rank_last('***WARNING*** could not find {} in the checkpoint, '
+                                'initializing to random'.format(
+                                    self._multichoice_head_key))
--- a/3rdparty/Megatron-LM/megatron/model/realm_model.py
+++ b/3rdparty/Megatron-LM/megatron/model/realm_model.py
+import os
+import torch
+from megatron import get_args, print_rank_0
+from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
+from megatron.model import BertModel
+from .module import MegatronModule
+from megatron import mpu
+from megatron.model.enums import AttnMaskType
+from megatron.model.utils import get_linear_layer
+from megatron.model.utils import init_method_normal
+from megatron.model.language_model import get_language_model
+from megatron.model.utils import scaled_init_method_normal
+from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
+def general_ict_model_provider(only_query_model=False, only_block_model=False):
+    """Build and return an ICTBertModel.
+    Asserts that --ict-head-size was given and that both the tensor and the
+    pipeline model parallel world sizes are 1.
+    """
+    args = get_args()
+    assert args.ict_head_size is not None, \
+        "Need to specify --ict-head-size to provide an ICTBertModel"
+    assert (mpu.get_tensor_model_parallel_world_size() == 1
+            and mpu.get_pipeline_model_parallel_world_size() == 1), \
+        "Model parallel size > 1 not supported for ICT"
+    print_rank_0('building ICTBertModel...')
+    # simpler to just keep using 2 tokentypes since the LM we initialize
+    # with has 2 tokentypes
+    model_kwargs = {
+        'ict_head_size': args.ict_head_size,
+        'num_tokentypes': 2,
+        'parallel_output': True,
+        'only_query_model': only_query_model,
+        'only_block_model': only_block_model,
+    }
+    return ICTBertModel(**model_kwargs)
+class ICTBertModel(MegatronModule):
+    """Bert-based module for Inverse Cloze task.
+    Holds up to two IREncoderBertModel towers: a query model and a block
+    model. Either one can be left out with only_block_model /
+    only_query_model, but not both.
+    Arguments:
+        ict_head_size, num_tokentypes, parallel_output: forwarded to each
+            IREncoderBertModel.
+        only_query_model: if true, the block model is not built.
+        only_block_model: if true, the query model is not built.
+    """
+    def __init__(self,
+                 ict_head_size,
+                 num_tokentypes=1,
+                 parallel_output=True,
+                 only_query_model=False,
+                 only_block_model=False):
+        super(ICTBertModel, self).__init__()
+        bert_kwargs = dict(
+            ict_head_size=ict_head_size,
+            num_tokentypes=num_tokentypes,
+            parallel_output=parallel_output
+        )
+        assert not (only_block_model and only_query_model)
+        self.use_block_model = not only_query_model
+        self.use_query_model = not only_block_model
+        if self.use_query_model:
+            # this model embeds (pseudo-)queries - Embed_input in the paper
+            self.query_model = IREncoderBertModel(**bert_kwargs)
+            self._query_key = 'question_model'
+        if self.use_block_model:
+            # this model embeds evidence blocks - Embed_doc in the paper
+            self.block_model = IREncoderBertModel(**bert_kwargs)
+            self._block_key = 'context_model'
+    def forward(self, query_tokens, query_attention_mask, block_tokens, block_attention_mask):
+        """Run a forward pass for each of the models and return the respective embeddings."""
+        query_logits = self.embed_query(query_tokens, query_attention_mask)
+        block_logits = self.embed_block(block_tokens, block_attention_mask)
+        return query_logits, block_logits
+    def embed_query(self, query_tokens, query_attention_mask):
+        """Embed a batch of tokens using the query model.
+        Raises ValueError when the query model was not built.
+        """
+        if not self.use_query_model:
+            raise ValueError("Cannot embed query without query model.")
+        # All-zero token types with the same shape as the tokens.
+        query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0)
+        query_ict_logits, _ = self.query_model.forward(query_tokens, query_attention_mask, query_types)
+        return query_ict_logits
+    def embed_block(self, block_tokens, block_attention_mask):
+        """Embed a batch of tokens using the block model.
+        Raises ValueError when the block model was not built.
+        """
+        if not self.use_block_model:
+            raise ValueError("Cannot embed block without block model.")
+        # All-zero token types with the same shape as the tokens.
+        block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0)
+        block_ict_logits, _ = self.block_model.forward(block_tokens, block_attention_mask, block_types)
+        return block_ict_logits
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False):
+        """Save dict with state dicts of each of the models."""
+        state_dict_ = {}
+        if self.use_query_model:
+            state_dict_[self._query_key] \
+                = self.query_model.state_dict_for_save_checkpoint(
+                destination, prefix, keep_vars)
+        if self.use_block_model:
+            state_dict_[self._block_key] \
+                = self.block_model.state_dict_for_save_checkpoint(
+                destination, prefix, keep_vars)
+        return state_dict_
+    def load_state_dict(self, state_dict, strict=True):
+        """Load the state dicts of each of the models"""
+        if self.use_query_model:
+            print("Loading ICT query model", flush=True)
+            self.query_model.load_state_dict(
+                state_dict[self._query_key], strict=strict)
+        if self.use_block_model:
+            print("Loading ICT block model", flush=True)
+            self.block_model.load_state_dict(
+                state_dict[self._block_key], strict=strict)
+    def init_state_dict_from_bert(self):
+        """Initialize the state from a pretrained BERT model on iteration zero of ICT pretraining.
+        Reads the iteration from the tracker file under args.bert_load,
+        loads that checkpoint on CPU and copies its language model weights
+        into every tower that was built. When both towers exist, the block
+        model's ict_head is also set to the query model's ict_head.
+        Raises FileNotFoundError when the tracker file is missing and
+        ValueError when the checkpoint cannot be loaded.
+        """
+        args = get_args()
+        tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
+        if not os.path.isfile(tracker_filename):
+            raise FileNotFoundError("Could not find BERT load for ICT")
+        with open(tracker_filename, 'r') as f:
+            iteration = int(f.read().strip())
+        assert iteration > 0
+        checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
+        if mpu.get_data_parallel_rank() == 0:
+            print('global rank {} is loading checkpoint {}'.format(
+                torch.distributed.get_rank(), checkpoint_name))
+        # NOTE: torch.load unpickles the file; only point --bert-load at
+        # checkpoints from a trusted source.
+        try:
+            state_dict = torch.load(checkpoint_name, map_location='cpu')
+        except Exception as err:
+            # Keep the ValueError callers expect, but chain the real cause
+            # and let KeyboardInterrupt/SystemExit propagate.
+            raise ValueError("Could not load checkpoint") from err
+        # load the LM state dict into each model that exists
+        model_dict = state_dict['model']['language_model']
+        if self.use_query_model:
+            self.query_model.language_model.load_state_dict(model_dict)
+        if self.use_block_model:
+            self.block_model.language_model.load_state_dict(model_dict)
+        # give each model the same ict_head to begin with as well; this only
+        # makes sense when both towers were built.
+        if self.use_query_model and self.use_block_model:
+            query_ict_head_state_dict = self.state_dict_for_save_checkpoint()[self._query_key]['ict_head']
+            self.block_model.ict_head.load_state_dict(query_ict_head_state_dict)
+class IREncoderBertModel(MegatronModule):
+    """BERT-based encoder for queries or blocks used for learned information retrieval.
+    Combines a pooled language model with a linear `ict_head` that maps the
+    pooled output to `ict_head_size` features.
+    """
+    def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True):
+        super().__init__()
+        args = get_args()
+        self.ict_head_size = ict_head_size
+        self.parallel_output = parallel_output
+        init_method = init_method_normal(args.init_method_std)
+        language_model_kwargs = dict(
+            num_tokentypes=num_tokentypes,
+            add_pooler=True,
+            encoder_attn_mask_type=AttnMaskType.padding,
+            init_method=init_method,
+            scaled_init_method=scaled_init_method_normal(
+                args.init_method_std, args.num_layers))
+        self.language_model, self._language_model_key = get_language_model(
+            **language_model_kwargs)
+        self.ict_head = get_linear_layer(args.hidden_size, ict_head_size, init_method)
+        self._ict_head_key = 'ict_head'
+    def forward(self, input_ids, attention_mask, tokentype_ids=None):
+        """Return (ict_logits, None) for a batch of token ids."""
+        # NOTE(review): multiple_choice.py calls bert_extended_attention_mask
+        # with the mask only, while a dtype is passed as well here -- confirm
+        # against the helper's current signature.
+        model_dtype = next(self.language_model.parameters()).dtype
+        extended_attention_mask = bert_extended_attention_mask(
+            attention_mask, model_dtype)
+        position_ids = bert_position_ids(input_ids)
+        # Only the pooled output is needed for the head.
+        _, pooled_output = self.language_model(
+            input_ids,
+            position_ids,
+            extended_attention_mask,
+            tokentype_ids=tokentype_ids)
+        # Output.
+        return self.ict_head(pooled_output), None
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """Return the language model and ict_head states, each under its
+        own key, so the model can be combined with other heads."""
+        language_model_state = \
+            self.language_model.state_dict_for_save_checkpoint(
+                destination, prefix, keep_vars)
+        ict_head_state = self.ict_head.state_dict(
+            destination, prefix, keep_vars)
+        return {
+            self._language_model_key: language_model_state,
+            self._ict_head_key: ict_head_state,
+        }
+    def load_state_dict(self, state_dict, strict=True):
+        """Load the language model and ict_head from their keyed entries."""
+        for key, submodule in ((self._language_model_key, self.language_model),
+                               (self._ict_head_key, self.ict_head)):
+            submodule.load_state_dict(state_dict[key], strict=strict)
--- a/3rdparty/Megatron-LM/megatron/model/t5_model.py
+++ b/3rdparty/Megatron-LM/megatron/model/t5_model.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""T5 model."""
+import torch
+from megatron import (
+    get_args,
+    mpu
+)
+from megatron.model.enums import AttnMaskType
+from megatron.model.language_model import parallel_lm_logits, get_language_model
+from megatron.model.transformer import LayerNorm
+from megatron.model.utils import (
+    openai_gelu,
+    get_linear_layer,
+    init_method_normal,
+    scaled_init_method_normal
+)
+from .module import MegatronModule
+def t5_extended_attention_mask(attention_mask_list):
+    """Return a list with a size-1 dimension inserted at position 1 of
+    every mask in `attention_mask_list` ([b, s, s] -> [b, 1, s, s])."""
+    return [mask.unsqueeze(1) for mask in attention_mask_list]
+def t5_position_ids(token_ids):
+    """Return position ids 0..seq_length-1, expanded to the shape of
+    `token_ids`, as a long tensor on the same device.
+    The sequence length is read from dimension 1 of `token_ids`.
+    """
+    positions = torch.arange(token_ids.size(1), dtype=torch.long,
+                             device=token_ids.device)
+    return positions[None, :].expand_as(token_ids)
+class T5LMHead(MegatronModule):
+    """Masked LM head for T5
+    Arguments:
+        mpu_vocab_size: model parallel size of vocabulary; this is the
+            length of the bias vector.
+        parallel_output: whether output logits being distributed or not.
+    """
+    def __init__(self, mpu_vocab_size, parallel_output):
+        super(T5LMHead, self).__init__()
+        # The only parameter of the head is a zero-initialized bias; the
+        # projection weight is supplied on every forward() call.
+        self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
+        # Tags on the bias parameter -- presumably read by the model-parallel
+        # utilities; confirm against mpu before changing them.
+        self.bias.model_parallel = True
+        self.bias.partition_dim = 0
+        self.bias.stride = 1
+        self.parallel_output = parallel_output
+    def forward(self, hidden_states, word_embeddings_weight):
+        """Return the result of parallel_lm_logits() for `hidden_states`
+        using `word_embeddings_weight` and this head's bias."""
+        return parallel_lm_logits(hidden_states,
+                                  word_embeddings_weight,
+                                  self.parallel_output,
+                                  bias=self.bias)
+class T5Model(MegatronModule):
+    """T5 Language model.
+    Wraps the language model built by get_language_model and, when this
+    instance has both post_process and add_decoder set, a T5LMHead that
+    produces the vocabulary logits.
+    Arguments:
+        num_tokentypes: forwarded to get_language_model.
+        parallel_output: stored on the model and passed to T5LMHead.
+        pre_process, post_process: stored and forwarded to
+            get_language_model; they also gate the LM head and the extra
+            word-embedding entry written to / read from checkpoints.
+        add_encoder, add_decoder: forwarded to get_language_model;
+            add_decoder additionally gates the LM head.
+    """
+    def __init__(self,
+                 num_tokentypes=0,
+                 parallel_output=True,
+                 pre_process=True,
+                 post_process=True,
+                 add_encoder=True,
+                 add_decoder=True):
+        super(T5Model, self).__init__()
+        args = get_args()
+        self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
+        self.parallel_output = parallel_output
+        init_method = init_method_normal(args.init_method_std)
+        scaled_init_method = scaled_init_method_normal(args.init_method_std,
+                                                       args.num_layers)
+        self.pre_process = pre_process
+        self.post_process = post_process
+        self.add_encoder = add_encoder
+        self.add_decoder = add_decoder
+        self.language_model, self._language_model_key = get_language_model(
+            num_tokentypes=num_tokentypes,
+            add_pooler=False,
+            add_encoder=add_encoder,
+            add_decoder=add_decoder,
+            encoder_attn_mask_type=AttnMaskType.padding,
+            init_method=init_method,
+            scaled_init_method=scaled_init_method,
+            pre_process=self.pre_process,
+            post_process=self.post_process)
+        # Note: this receives the init_method_normal factory itself, not the
+        # `init_method` instance created above.
+        self.initialize_word_embeddings(init_method_normal)
+        # The LM head (and its checkpoint key) exist only on instances that
+        # both post-process and own the decoder.
+        if self.post_process and self.add_decoder:
+            self.lm_head = T5LMHead(
+                self.word_embeddings_weight().size(0),
+                parallel_output)
+            self._lm_head_key = 'lm_head'
+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        self.language_model.set_input_tensor(input_tensor)
+    def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask,
+                decoder_attn_mask, encoder_decoder_attn_mask,
+                tokentype_ids=None, lm_labels=None, enc_hidden_states=None):
+        """Run the language model and, where present, the LM head.
+        Returns:
+            - With post_process and add_decoder: lm_logits when lm_labels
+              is None, otherwise the vocab-parallel cross-entropy loss
+              (computed on lm_logits.float() unless fp16_lm_cross_entropy
+              is set).
+            - Otherwise, with add_decoder and without add_encoder: the
+              decoder output.
+            - Otherwise: the language model output as returned.
+        """
+        # Converting the attention masks to proper parameter settings
+        encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask = t5_extended_attention_mask(
+            [encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask])
+        encoder_position_ids = t5_position_ids(encoder_input_ids)
+        decoder_position_ids = t5_position_ids(decoder_input_ids)
+        lm_output = self.language_model(encoder_input_ids,
+                                        encoder_position_ids,
+                                        encoder_attn_mask,
+                                        decoder_input_ids,
+                                        decoder_position_ids,
+                                        decoder_attn_mask,
+                                        encoder_decoder_attn_mask,
+                                        tokentype_ids=tokentype_ids,
+                                        enc_hidden_states=enc_hidden_states)
+        if self.post_process and self.add_decoder:
+            decoder_output, encoder_output = lm_output
+            # Output.
+            lm_logits = self.lm_head(decoder_output,
+                                     self.word_embeddings_weight())
+            if lm_labels is None:
+                return lm_logits
+            else:
+                if self.fp16_lm_cross_entropy:
+                    # The fp16 loss path requires half-precision logits.
+                    assert lm_logits.dtype == torch.half
+                    lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
+                else:
+                    lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
+                                                               lm_labels)
+            return lm_loss
+        # Reached only when the branch above was not taken, i.e. with
+        # add_decoder set this means post_process is False.
+        elif self.add_decoder and not self.add_encoder:
+            decoder_output, encoder_output = lm_output
+            return decoder_output
+        else:
+            encoder_output = lm_output
+            return encoder_output
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        """For easy load when model is combined with other heads,
+        add an extra key.
+        The returned dict holds the language model state under
+        self._language_model_key, the LM head state under
+        self._lm_head_key when the head exists, and the word embeddings
+        when this instance post-processes but does not pre-process.
+        """
+        state_dict_ = {}
+        state_dict_[self._language_model_key] \
+            = self.language_model.state_dict_for_save_checkpoint(
+            destination, prefix, keep_vars)
+        if self.post_process and self.add_decoder:
+            state_dict_[self._lm_head_key] \
+                = self.lm_head.state_dict_for_save_checkpoint(
+                destination, prefix, keep_vars)
+         # Save word_embeddings.
+        if self.post_process and not self.pre_process and self.add_decoder:
+            state_dict_[self._word_embeddings_for_head_key] \
+                = self.word_embeddings.state_dict(destination, prefix, keep_vars)
+        return state_dict_
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load.
+        Mirrors state_dict_for_save_checkpoint: each sub-module is loaded
+        from its own key, under the same conditions used when saving.
+        """
+        self.language_model.load_state_dict(
+            state_dict[self._language_model_key], strict=strict)
+        if self.post_process and self.add_decoder:
+            self.lm_head.load_state_dict(state_dict[self._lm_head_key],
+                                         strict=strict)
+        # Load word embeddings.
+        if self.post_process and not self.pre_process and self.add_decoder:
+            self.word_embeddings.load_state_dict(
+                state_dict[self._word_embeddings_for_head_key], strict=strict)
--- a/3rdparty/Megatron-LM/megatron/model/transformer.py
+++ b/3rdparty/Megatron-LM/megatron/model/transformer.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformer."""
+import math
+import torch
+import torch.nn.functional as F
+from megatron import get_args
+from megatron import mpu
+from .module import MegatronModule
+from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType
+from megatron.model import LayerNorm
+from megatron.model.fused_softmax import FusedScaleMaskSoftmax
+from megatron.model.fused_bias_gelu import bias_gelu_impl
+from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu
+""" We use the following notation throughout this file:
+     h: hidden size
+     n: number of attention heads
+     p: number of model parallel partitions
+     np: n/p
+     hp: h/p
+     hn: h/n
+     b: batch size
+     s: sequence length
+     l: number of layers
+    Transformer takes input of size [s, b, h] and returns a
+    tensor of the same size. We use the following arguments:
+        hyperparameters: transformer hyperparameters
+"""
+class ParallelMLP(MegatronModule):
+    """MLP.
+    MLP will take the input with h hidden state, project it to 4*h
+    hidden dimension, perform nonlinear transformation, and project the
+    state back into h hidden dimension.
+    """
+    def __init__(self, init_method, output_layer_init_method):
+        super(ParallelMLP, self).__init__()
+        args = get_args()
+        # Project to 4h.
+        self.dense_h_to_4h = mpu.ColumnParallelLinear(
+            args.hidden_size,
+            args.ffn_hidden_size,
+            gather_output=False,
+            init_method=init_method,
+            skip_bias_add=True)
+        self.bias_gelu_fusion = args.bias_gelu_fusion
+        self.activation_func = F.gelu
+        if args.openai_gelu:
+            self.activation_func = openai_gelu
+        elif args.onnx_safe:
+            self.activation_func = erf_gelu
+        # Project back to h.
+        self.dense_4h_to_h = mpu.RowParallelLinear(
+            args.ffn_hidden_size,
+            args.hidden_size,
+            input_is_parallel=True,
+            init_method=output_layer_init_method,
+            skip_bias_add=True)
+    def forward(self, hidden_states):
+        # [s, b, 4hp]
+        intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states)
+        if self.bias_gelu_fusion:
+             intermediate_parallel = \
+                     bias_gelu_impl(intermediate_parallel, bias_parallel)
+        else:
+            intermediate_parallel = \
+                self.activation_func(intermediate_parallel + bias_parallel)
+        # [s, b, h]
+        output, output_bias = self.dense_4h_to_h(intermediate_parallel)
+        return output, output_bias
+class ParallelAttention(MegatronModule):
+    """Parallel attention layer (self-attention or cross-attention).
+    Following the shape comments in forward() and the notation at the top
+    of this file, the layer takes input of size [s, b, h] and returns an
+    output of the same size, together with the output projection's bias.
+    """
+    def __init__(self, init_method,
+                 output_layer_init_method, layer_number,
+                 attention_type=AttnType.self_attn,
+                 attn_mask_type=AttnMaskType.padding):
+        """Build the projections, softmax and dropout for one attention layer.
+        Arguments:
+            init_method: initializer for the query/key/value projections.
+            output_layer_init_method: initializer for the output projection.
+            layer_number: layer index; clamped to at least 1 and used both
+                as the query-key scaling coefficient and as the key into
+                the inference key/value cache.
+            attention_type: AttnType.self_attn builds one fused QKV
+                projection; AttnType.cross_attn builds separate query and
+                key/value projections.
+            attn_mask_type: passed to FusedScaleMaskSoftmax.
+        """
+        super(ParallelAttention, self).__init__()
+        args = get_args()
+        self.fp16 = args.fp16
+        self.bf16 = args.bf16
+        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
+        # Query-key layer scaling always forces the softmax to run in fp32.
+        if self.apply_query_key_layer_scaling:
+            self.attention_softmax_in_fp32 = True
+        self.layer_number = max(1, layer_number)
+        self.attention_type = attention_type
+        self.attn_mask_type = attn_mask_type
+        self.params_dtype = args.params_dtype
+        projection_size = args.kv_channels * args.num_attention_heads
+        # Per attention head and per partition values.
+        world_size = mpu.get_tensor_model_parallel_world_size()
+        self.hidden_size_per_partition = mpu.divide(projection_size,
+                                                    world_size)
+        self.hidden_size_per_attention_head = mpu.divide(
+            projection_size, args.num_attention_heads)
+        self.num_attention_heads_per_partition = mpu.divide(
+            args.num_attention_heads, world_size)
+        # Strided linear layer.
+        if attention_type == AttnType.self_attn:
+            self.query_key_value = mpu.ColumnParallelLinear(
+                args.hidden_size,
+                3 * projection_size,
+                gather_output=False,
+                init_method=init_method)
+        else:
+            assert attention_type == AttnType.cross_attn
+            self.query = mpu.ColumnParallelLinear(
+                args.hidden_size,
+                projection_size,
+                gather_output=False,
+                init_method=init_method)
+            self.key_value = mpu.ColumnParallelLinear(
+                args.hidden_size,
+                2 * projection_size,
+                gather_output=False,
+                init_method=init_method)
+        # norm_factor starts as sqrt(hn); with query-key layer scaling it is
+        # further multiplied by the layer number, and that same coefficient
+        # is handed to the softmax (presumably so it can undo the extra
+        # scaling - confirm in FusedScaleMaskSoftmax).
+        coeff = None
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+        self.scale_mask_softmax = FusedScaleMaskSoftmax(
+            self.fp16, self.bf16,
+            self.attn_mask_type,
+            args.masked_softmax_fusion,
+            attention_mask_func,
+            self.attention_softmax_in_fp32,
+            coeff)
+        # Dropout. Note that for a single iteration, this layer will generate
+        # different outputs on different number of parallel partitions but
+        # on average it should not be partition dependent.
+        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)
+        # Output.
+        self.dense = mpu.RowParallelLinear(
+            projection_size,
+            args.hidden_size,
+            input_is_parallel=True,
+            init_method=output_layer_init_method,
+            skip_bias_add=True)
+    def _allocate_memory(self, inference_max_sequence_len, batch_size):
+        """Return an uninitialized [max_seq_len, batch, np, hn] cache buffer
+        in params_dtype on the current CUDA device."""
+        return torch.empty(
+            inference_max_sequence_len,
+            batch_size,
+            self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+            dtype=self.params_dtype,
+            device=torch.cuda.current_device())
+    def forward(self, hidden_states, attention_mask,
+                encoder_output=None, inference_params=None):
+        """Compute attention and the output projection.
+        Arguments:
+            hidden_states: query-side input, [sq, b, h] per the shape
+                comments below.
+            attention_mask: mask handed to the fused scale-mask-softmax.
+            encoder_output: key/value source; read only for cross-attention.
+            inference_params: optional object providing
+                key_value_memory_dict, max_sequence_len, max_batch_size,
+                batch_size_offset and sequence_len_offset. When given,
+                keys and values are written into, and then read back
+                from, a per-layer cache.
+        Returns:
+            (output, bias) as returned by the output projection, which is
+            built with skip_bias_add=True.
+        """
+        # hidden_states: [sq, b, h]
+        # =================================================
+        # Pre-allocate memory for key-values for inference.
+        # =================================================
+        if inference_params:
+            # The cache is keyed by layer number and created on first use.
+            if self.layer_number not in inference_params.key_value_memory_dict:
+                inf_max_seq_len = inference_params.max_sequence_len
+                inf_max_batch_size = inference_params.max_batch_size
+                inference_key_memory = self._allocate_memory(
+                    inf_max_seq_len, inf_max_batch_size)
+                inference_value_memory = self._allocate_memory(
+                    inf_max_seq_len, inf_max_batch_size)
+                inference_params.key_value_memory_dict[self.layer_number] = (
+                    inference_key_memory, inference_value_memory)
+            else:
+                inference_key_memory, inference_value_memory = \
+                    inference_params.key_value_memory_dict[self.layer_number]
+        # =====================
+        # Query, Key, and Value
+        # =====================
+        if self.attention_type == AttnType.self_attn:
+            # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
+            mixed_x_layer, _ = self.query_key_value(hidden_states)
+            # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
+            new_tensor_shape = mixed_x_layer.size()[:-1] + \
+                (self.num_attention_heads_per_partition,
+                 3 * self.hidden_size_per_attention_head)
+            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
+            (query_layer,
+             key_layer,
+             value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3)
+        else:
+            # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
+            mixed_kv_layer, _ = self.key_value(encoder_output)
+            # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
+            new_tensor_shape = mixed_kv_layer.size()[:-1] + \
+                (self.num_attention_heads_per_partition,
+                 2 * self.hidden_size_per_attention_head)
+            mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)
+            # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
+            (key_layer,
+             value_layer) = mpu.split_tensor_along_last_dim(mixed_kv_layer, 2)
+            # Attention head [sq, b, h] --> [sq, b, hp]
+            query_layer, _ = self.query(hidden_states)
+            # [sq, b, hp] --> [sq, b, np, hn]
+            new_tensor_shape = query_layer.size()[:-1] + \
+                (self.num_attention_heads_per_partition,
+                 self.hidden_size_per_attention_head)
+            query_layer = query_layer.view(*new_tensor_shape)
+        # ==================================
+        # Adjust key and value for inference
+        # ==================================
+        if inference_params:
+            batch_start = inference_params.batch_size_offset
+            batch_end = batch_start + key_layer.size(1)
+            assert batch_end <= inference_key_memory.size(1)
+            sequence_start = inference_params.sequence_len_offset
+            sequence_end = sequence_start + key_layer.size(0)
+            assert sequence_end <= inference_key_memory.size(0)
+            # Copy key and values.
+            inference_key_memory[sequence_start:sequence_end,
+                                 batch_start:batch_end, ...] = key_layer
+            inference_value_memory[sequence_start:sequence_end,
+                                   batch_start:batch_end, ...] = value_layer
+            # Attend over everything cached so far (positions 0..sequence_end),
+            # not just the newly written slice.
+            key_layer = inference_key_memory[
+                :sequence_end, batch_start:batch_end, ...]
+            value_layer = inference_value_memory[
+                :sequence_end, batch_start:batch_end, ...]
+        # ===================================
+        # Raw attention scores. [b, np, s, s]
+        # ===================================
+        # [b, np, sq, sk]
+        output_size = (query_layer.size(1),
+                       query_layer.size(2),
+                       query_layer.size(0),
+                       key_layer.size(0))
+        # [sq, b, np, hn] -> [sq, b * np, hn]
+        query_layer = query_layer.view(output_size[2],
+                                       output_size[0] * output_size[1], -1)
+        # [sk, b, np, hn] -> [sk, b * np, hn]
+        key_layer = key_layer.view(output_size[3],
+                                   output_size[0] * output_size[1], -1)
+        # preallocating result tensor: [b * np, sq, sk]
+        matmul_result = torch.empty(
+            output_size[0]*output_size[1],
+            output_size[2],
+            output_size[3],
+            dtype=query_layer.dtype,
+            device=torch.cuda.current_device())
+        # Raw attention scores. [b * np, sq, sk]
+        # With beta=0.0 the uninitialized contents of matmul_result are
+        # ignored; only alpha * (Q @ K^T) contributes to the result.
+        matmul_result = torch.baddbmm(
+            matmul_result,
+            query_layer.transpose(0, 1),   # [b * np, sq, hn]
+            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+            beta=0.0, alpha=(1.0/self.norm_factor))
+        # change view to [b, np, sq, sk]
+        attention_scores = matmul_result.view(*output_size)
+        # ===========================
+        # Attention probs and dropout
+        # ===========================
+        # attention scores and attention mask [b, np, sq, sk]
+        attention_probs = self.scale_mask_softmax(attention_scores,
+                                                  attention_mask)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        with mpu.get_cuda_rng_tracker().fork():
+            attention_probs = self.attention_dropout(attention_probs)
+        # =========================
+        # Context layer. [sq, b, hp]
+        # =========================
+        # value_layer -> context layer.
+        # [sk, b, np, hn] --> [b, np, sq, hn]
+        # context layer shape: [b, np, sq, hn]
+        output_size = (value_layer.size(1),
+                       value_layer.size(2),
+                       query_layer.size(0),
+                       value_layer.size(3))
+        # change view [sk, b * np, hn]
+        value_layer = value_layer.view(value_layer.size(0),
+                                       output_size[0] * output_size[1], -1)
+        # change view [b * np, sq, sk]
+        attention_probs = attention_probs.view(output_size[0] * output_size[1],
+                                               output_size[2], -1)
+        # matmul: [b * np, sq, hn]
+        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+        # change view [b, np, sq, hn]
+        context_layer = context_layer.view(*output_size)
+        # [b, np, sq, hn] --> [sq, b, np, hn]
+        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+        # [sq, b, np, hn] --> [sq, b, hp]
+        new_context_layer_shape = context_layer.size()[:-2] + \
+            (self.hidden_size_per_partition,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+        # =================
+        # Output. [sq, b, h]
+        # =================
+        output, bias = self.dense(context_layer)
+        return output, bias
+def bias_dropout_add(x, bias, residual, prob, training):
+    # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
+    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
+    out = residual + out
+    return out
+def get_bias_dropout_add(training):
+    def _bias_dropout_add(x, bias, residual, prob):
+        return bias_dropout_add(x, bias, residual, prob, training)
+    return _bias_dropout_add
+@torch.jit.script
+def bias_dropout_add_fused_train(x: torch.Tensor,
+                                 bias: torch.Tensor,
+                                 residual: torch.Tensor,
+                                 prob: float) -> torch.Tensor:
+    # TorchScript-compiled variant of bias_dropout_add with the training
+    # flag fixed to True, so dropout is active.
+    return bias_dropout_add(x, bias, residual, prob, True)
+@torch.jit.script
+def bias_dropout_add_fused_inference(x: torch.Tensor,
+                                     bias: torch.Tensor,
+                                     residual: torch.Tensor,
+                                     prob: float) -> torch.Tensor:
+    # TorchScript-compiled variant of bias_dropout_add with the training
+    # flag fixed to False, so dropout is disabled.
+    return bias_dropout_add(x, bias, residual, prob, False)
+class ParallelTransformerLayer(MegatronModule):
+    """A single transformer layer.
+    Per the notation at the top of this file, the transformer takes input
+    of size [s, b, h] and returns an output of the same size.
+    The layer applies: input layernorm -> self attention -> residual,
+    then (decoder layers only) layernorm -> cross attention -> residual,
+    then layernorm -> MLP -> residual.
+    """
+    def __init__(self, init_method, output_layer_init_method,
+                 layer_number, layer_type=LayerType.encoder,
+                 self_attn_mask_type=AttnMaskType.padding):
+        """Build the layer's sub-modules.
+        Arguments:
+            init_method: initializer passed to the attention and MLP
+                input projections.
+            output_layer_init_method: initializer passed to the attention
+                and MLP output projections.
+            layer_number: stored and forwarded to ParallelAttention.
+            layer_type: LayerType.decoder additionally creates the
+                cross-attention block and its layernorm.
+            self_attn_mask_type: mask type for the self-attention block.
+        """
+        args = get_args()
+        super(ParallelTransformerLayer, self).__init__()
+        self.layer_number = layer_number
+        self.layer_type = layer_type
+        self.apply_residual_connection_post_layernorm \
+            = args.apply_residual_connection_post_layernorm
+        self.bf16 = args.bf16
+        self.fp32_residual_connection = args.fp32_residual_connection
+        # Layernorm on the input data.
+        self.input_layernorm = LayerNorm(
+            args.hidden_size,
+            eps=args.layernorm_epsilon,
+            no_persist_layer_norm=args.no_persist_layer_norm)
+        # Self attention.
+        self.self_attention = ParallelAttention(
+            init_method,
+            output_layer_init_method,
+            layer_number,
+            attention_type=AttnType.self_attn,
+            attn_mask_type=self_attn_mask_type)
+        self.hidden_dropout = args.hidden_dropout
+        self.bias_dropout_fusion = args.bias_dropout_fusion
+        # Layernorm on the attention output
+        self.post_attention_layernorm = LayerNorm(
+            args.hidden_size,
+            eps=args.layernorm_epsilon,
+            no_persist_layer_norm=args.no_persist_layer_norm)
+        if self.layer_type == LayerType.decoder:
+            # Cross attention uses ParallelAttention's default mask type.
+            self.inter_attention = ParallelAttention(
+                init_method,
+                output_layer_init_method,
+                layer_number,
+                attention_type=AttnType.cross_attn)
+            # Layernorm on the attention output.
+            self.post_inter_attention_layernorm = LayerNorm(
+                args.hidden_size,
+                eps=args.layernorm_epsilon,
+                no_persist_layer_norm=args.no_persist_layer_norm)
+        # MLP
+        self.mlp = ParallelMLP(init_method,
+                               output_layer_init_method)
+    def forward(self, hidden_states, attention_mask,
+                encoder_output=None, enc_dec_attn_mask=None,
+                inference_params=None):
+        """Apply the layer to hidden_states and return the new hidden states.
+        Arguments:
+            hidden_states: layer input.
+            attention_mask: mask for the self-attention block.
+            encoder_output, enc_dec_attn_mask: used only by decoder layers,
+                as the key/value source and mask of the cross attention.
+            inference_params: forwarded to the self-attention block only.
+        """
+        # hidden_states: [s, b, h]
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output, attention_bias = \
+            self.self_attention(
+                layernorm_output,
+                attention_mask,
+                inference_params=inference_params)
+        # Residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+        # jit scripting for a nn.module (with dropout) is not
+        # triggering the fusion kernel. For now, we use two
+        # different nn.functional routines to account for varying
+        # dropout semantics during training and inference phases.
+        # The function chosen here is reused for every residual step below.
+        if self.bias_dropout_fusion:
+            if self.training:
+                bias_dropout_add_func = bias_dropout_add_fused_train
+            else:
+                bias_dropout_add_func = bias_dropout_add_fused_inference
+        else:
+            bias_dropout_add_func = get_bias_dropout_add(self.training)
+        # re-enable torch grad to enable fused optimization.
+        with torch.enable_grad():
+            layernorm_input = bias_dropout_add_func(
+                attention_output,
+                attention_bias.expand_as(residual),
+                residual,
+                self.hidden_dropout)
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+        if self.layer_type == LayerType.decoder:
+            # Note: inference_params is not passed to the cross attention.
+            attention_output, attention_bias = \
+                self.inter_attention(layernorm_output,
+                                     enc_dec_attn_mask,
+                                     encoder_output=encoder_output)
+            # residual connection
+            if self.apply_residual_connection_post_layernorm:
+                residual = layernorm_output
+            else:
+                residual = layernorm_input
+            # re-enable torch grad to enable fused optimization.
+            with torch.enable_grad():
+                layernorm_input = bias_dropout_add_func(
+                    attention_output,
+                    attention_bias.expand_as(residual),
+                    residual,
+                    self.hidden_dropout)
+            # Layer norm post the decoder attention
+            layernorm_output = self.post_inter_attention_layernorm(layernorm_input)
+        # MLP.
+        mlp_output, mlp_bias = self.mlp(layernorm_output)
+        # Second residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = layernorm_input
+        # re-enable torch grad to enable fused optimization.
+        with torch.enable_grad():
+            output = bias_dropout_add_func(
+                mlp_output,
+                mlp_bias.expand_as(residual),
+                residual,
+                self.hidden_dropout)
+        return output
+class ParallelTransformer(MegatronModule):
+    """Transformer class."""
+    def __init__(self, init_method, output_layer_init_method,
+                 layer_type=LayerType.encoder,
+                 self_attn_mask_type=AttnMaskType.padding,
+                 pre_process=True, post_process=True):
+        super(ParallelTransformer, self).__init__()
+        args = get_args()
+        self.bf16 = args.bf16
+        self.fp32_residual_connection = args.fp32_residual_connection
+        self.pre_process = pre_process
+        self.post_process = post_process
+        self.input_tensor = None
+        # Store activation checkpoiting flag.
+        self.activations_checkpoint_method = args.activations_checkpoint_method
+        self.activations_checkpoint_num_layers = args.activations_checkpoint_num_layers
+        self.distribute_checkpointed_activations = args.distribute_checkpointed_activations
+        # Number of layers.
+        self.num_layers = mpu.get_num_layers(
+            args, args.model_type == ModelType.encoder_and_decoder)
+        # Transformer layers.
+        def build_layer(layer_number):
+            return ParallelTransformerLayer(
+                init_method,
+                output_layer_init_method,
+                layer_number,
+                layer_type=layer_type,
+                self_attn_mask_type=self_attn_mask_type)
+        if args.virtual_pipeline_model_parallel_size is not None:
+            assert args.num_layers % args.virtual_pipeline_model_parallel_size == 0, \
+                'num_layers_per_stage must be divisible by ' \
+                'virtual_pipeline_model_parallel_size'
+            assert args.model_type != ModelType.encoder_and_decoder
+            # Number of layers in each model chunk is the number of layers in the stage,
+            # divided by the number of model chunks in a stage.
+            self.num_layers = self.num_layers // args.virtual_pipeline_model_parallel_size
+            # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
+            # layers to stages like (each list is a model chunk):
+            # Stage 0: [0]  [2]  [4]  [6]
+            # Stage 1: [1]  [3]  [5]  [7]
+            # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
+            # layers to stages like (each list is a model chunk):
+            # Stage 0: [0, 1]  [4, 5]
+            # Stage 1: [2, 3]  [6, 7]
+            offset = mpu.get_virtual_pipeline_model_parallel_rank() * (
+                args.num_layers // args.virtual_pipeline_model_parallel_size) + \
+                (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
+        else:
+            # Each stage gets a contiguous set of layers.
+            if args.model_type == ModelType.encoder_and_decoder and \
+                    mpu.get_pipeline_model_parallel_world_size() > 1:
+                pipeline_rank = mpu.get_pipeline_model_parallel_rank()
+                if layer_type == LayerType.encoder:
+                    offset = pipeline_rank * self.num_layers
+                else:
+                    num_ranks_in_enc = args.pipeline_model_parallel_split_rank
+                    offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers
+            else:
+                offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers
+        self.layers = torch.nn.ModuleList(
+            [build_layer(i + 1 + offset) for i in range(self.num_layers)])
+        if self.post_process:
+            # Final layer norm before output.
+            self.final_layernorm = LayerNorm(
+                args.hidden_size,
+                eps=args.layernorm_epsilon,
+                no_persist_layer_norm=args.no_persist_layer_norm)
+    def _get_layer(self, layer_number):
+        return self.layers[layer_number]
+    def _checkpointed_forward(self, hidden_states, attention_mask,
+                              encoder_output, enc_dec_attn_mask):
+        """Forward method with activation checkpointing."""
+        def custom(start, end):
+            def custom_forward(*inputs):
+                x_ = inputs[0]
+                attention_mask = inputs[1]
+                encoder_output = inputs[2]
+                enc_dec_attn_mask = inputs[3]
+                for index in range(start, end):
+                    layer = self._get_layer(index)
+                    x_ = layer(x_, attention_mask, encoder_output, enc_dec_attn_mask)
+                return x_
+            return custom_forward
+        if self.activations_checkpoint_method == 'uniform':
+            # Uniformly divide the total number of Transformer layers and checkpoint
+            # the input activation of each divided chunk.
+            # A method to further reduce memory usage reducing checkpoints.
+            l = 0
+            while l < self.num_layers:
+                hidden_states = mpu.checkpoint(
+                    custom(l, l + self.activations_checkpoint_num_layers),
+                    self.distribute_checkpointed_activations,
+                    hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
+                l += self.activations_checkpoint_num_layers
+        elif self.activations_checkpoint_method == 'block':
+            # Checkpoint the input activation of only a set number of individual
+            # Transformer layers and skip the rest.
+            # A method fully use the device memory removing redundant re-computation.
+            for l in range(self.num_layers):
+                if l < self.activations_checkpoint_num_layers:
+                    hidden_states = mpu.checkpoint(
+                        custom(l, l + 1),
+                        self.distribute_checkpointed_activations,
+                        hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
+                else:
+                    hidden_states = custom(l, l + 1)(
+                        hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
+        else:
+            raise ValueError("Invalid activation checkpoint method.")
+        return hidden_states
+    def set_input_tensor(self, input_tensor):
+        """Set input tensor to be used instead of forward()'s input.
+        When doing pipeline parallelism the input from the previous
+        stage comes from communication, not from the input, so the
+        model's forward_step_func won't have it. This function is thus
+        used by internal code to bypass the input provided by the
+        forward_step_func.
+        Arguments:
+            input_tensor: tensor stored on self.input_tensor; forward()
+                reads it in place of its hidden_states argument when
+                self.pre_process is false.
+        """
+        self.input_tensor = input_tensor
+    def forward(self, hidden_states, attention_mask,
+                encoder_output=None, enc_dec_attn_mask=None,
+                inference_params=None):
+        """Run hidden_states through every transformer layer of this stage.
+        Arguments:
+            hidden_states: stage input; only used when self.pre_process is
+                set, otherwise the tensor stored by set_input_tensor() is
+                used instead.
+            attention_mask: passed unchanged to every layer.
+            encoder_output: optional; dims 0 and 1 are swapped before it is
+                passed to the layers.
+            enc_dec_attn_mask: optional; passed unchanged to every layer.
+            inference_params: optional; forwarded to the layers on the
+                non-checkpointed path only, and must be falsy when
+                activation checkpointing is enabled.
+        Returns:
+            With self.post_process set, the hidden states with dims 0 and 1
+            swapped back and self.final_layernorm applied; otherwise the
+            untouched output of the last layer.
+        """
+        # Checks.
+        if inference_params:
+            assert self.activations_checkpoint_method is None, \
+                'inference does not work with activation checkpointing'
+        if self.pre_process:
+            # Data format change to avoid explicit transposes : [b s h] --> [s b h].
+            # If the input flag for fp32 residual connection is set, convert for float.
+            if self.fp32_residual_connection:
+                hidden_states = hidden_states.transpose(0, 1).contiguous().float()
+            # Otherwise, leave it as is.
+            else:
+                hidden_states = hidden_states.transpose(0, 1).contiguous()
+        else:
+            # See set_input_tensor()
+            hidden_states = self.input_tensor
+        # Viewless tensor.
+        # - We only need to create a viewless tensor in the case of micro batch
+        #   size (mbs) == 1, since in this case, 'hidden_states.transpose()'
+        #   above creates a view tensor, and '.contiguous()' is a pass-through.
+        #   For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
+        #   the need to make it viewless.
+        #
+        #   However, we don't explicitly check mbs == 1 here because
+        #   make_viewless_tensor() has negligible overhead when its input
+        #   is already viewless.
+        #
+        # - For the 'else' case above, calling make_viewless_tensor() here is
+        #   likely redundant, since p2p_communication.py (likely originator)
+        #   already creates viewless tensors. That said, make_viewless_tensor()
+        #   is called here to be future-proof and corner-case-proof.
+        hidden_states = mpu.make_viewless_tensor(
+            hidden_states,
+            requires_grad = True,
+            keep_graph = True,
+        )
+        # Transpose encoder output.
+        if encoder_output is not None:
+            encoder_output = encoder_output.transpose(0, 1).contiguous()
+        # Forward pass.
+        if self.activations_checkpoint_method is not None:
+            # The checkpointed path does not receive inference_params; the
+            # assert at the top guarantees they are not set in this branch.
+            hidden_states = self._checkpointed_forward(hidden_states,
+                                                       attention_mask,
+                                                       encoder_output,
+                                                       enc_dec_attn_mask)
+        else:
+            for index in range(self.num_layers):
+                layer = self._get_layer(index)
+                hidden_states = layer(
+                    hidden_states,
+                    attention_mask,
+                    encoder_output=encoder_output,
+                    enc_dec_attn_mask=enc_dec_attn_mask,
+                    inference_params=inference_params)
+        # Final layer norm.
+        if self.post_process:
+            # Reverting data format change [s b h] --> [b s h].
+            hidden_states = hidden_states.transpose(0, 1).contiguous()
+            output = self.final_layernorm(hidden_states)
+        else:
+            output = hidden_states
+        return output
--- a/3rdparty/Megatron-LM/megatron/model/utils.py
+++ b/3rdparty/Megatron-LM/megatron/model/utils.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utilities for models."""
+import math
+import torch
+from megatron import get_args
+def init_method_normal(sigma):
+    """Return an initializer that fills a tensor in place from N(0, sigma)."""
+    def _fill_normal(tensor):
+        # normal_ mutates `tensor` and hands the same object back.
+        return torch.nn.init.normal_(tensor, std=sigma, mean=0.0)
+    return _fill_normal
+def scaled_init_method_normal(sigma, num_layers):
+    """Return an initializer drawing from N(0, sigma / sqrt(2 * num_layers))."""
+    # The scaled std is computed once here, not on every initializer call.
+    scaled_std = sigma / math.sqrt(2.0 * num_layers)
+    def _fill_scaled_normal(tensor):
+        return torch.nn.init.normal_(tensor, std=scaled_std, mean=0.0)
+    return _fill_scaled_normal
+def attention_mask_func(attention_scores, attention_mask):
+    """Overwrite masked attention scores with -10000.0, in place.
+    Arguments:
+        attention_scores: tensor of scores; modified in place.
+        attention_mask: mask handed to masked_fill_; positions where it is
+            true are overwritten.
+    Returns the same attention_scores tensor.
+    """
+    # masked_fill_ returns the tensor it mutated, so return it directly.
+    return attention_scores.masked_fill_(attention_mask, -10000.0)
+def get_linear_layer(rows, columns, init_method):
+    """Build torch.nn.Linear(rows, columns) with custom weight initialization.
+    Arguments:
+        rows: passed as the first argument of torch.nn.Linear.
+        columns: passed as the second argument of torch.nn.Linear.
+        init_method: callable applied to the layer's weight tensor.
+    Returns the layer, with its bias set to zero.
+    """
+    linear = torch.nn.Linear(rows, columns)
+    init_method(linear.weight)
+    # zeros_ fills the bias in place without recording autograd history.
+    torch.nn.init.zeros_(linear.bias)
+    return linear
+@torch.jit.script
+def gelu_impl(x):
+    """Tanh-based GELU approximation (OpenAI's formulation)."""
+    # 0.7978845608028654 is sqrt(2 / pi).
+    inner = 0.7978845608028654 * x * (1.0 + 0.044715 * x * x)
+    return 0.5 * x * (1.0 + torch.tanh(inner))
+def openai_gelu(x):
+    """Apply the tanh-based GELU by delegating to the scripted gelu_impl."""
+    return gelu_impl(x)
+# Python counterpart of torch.nn.functional.gelu(), written with explicit
+# dtype casts (type hints) for the ONNX exporter.
+# NOTE(review): 1.41421 is sqrt(2) truncated to six significant digits, so the
+# result only approximates the exact erf-based GELU.
+@torch.jit.script
+def erf_gelu(x):
+    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
--- a/3rdparty/Megatron-LM/megatron/model/vision/classification.py
+++ b/3rdparty/Megatron-LM/megatron/model/vision/classification.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Vision Transformer(VIT) model."""
+import torch
+from megatron import get_args
+from megatron.model.utils import get_linear_layer
+from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead
+from megatron.model.module import MegatronModule
+class VitClassificationModel(MegatronModule):
+    """Vision Transformer classifier: a VitBackbone plus a classification head.
+    Arguments:
+        num_classes: output width of the classification head.
+        finetune: if True the head is a single linear layer whose weight is
+            initialized with zeros; otherwise it is a VitMlpHead.
+        pre_process: forwarded to the backbone.
+        post_process: forwarded to the backbone; the head is only built and
+            applied when this is True.
+    """
+    def __init__(self, num_classes, finetune=False,
+                 pre_process=True, post_process=True):
+        super(VitClassificationModel, self).__init__()
+        self.hidden_size = get_args().hidden_size
+        self.num_classes = num_classes
+        self.finetune = finetune
+        self.pre_process = pre_process
+        self.post_process = post_process
+        # The backbone is asked for a single token per sample.
+        self.backbone = VitBackbone(
+            pre_process=self.pre_process,
+            post_process=self.post_process,
+            single_token_output=True
+        )
+        if self.post_process:
+            if self.finetune:
+                self.head = get_linear_layer(
+                    self.hidden_size,
+                    self.num_classes,
+                    torch.nn.init.zeros_
+                )
+            else:
+                self.head = VitMlpHead(self.hidden_size, self.num_classes)
+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        self.backbone.set_input_tensor(input_tensor)
+    def forward(self, input):
+        """Run the backbone and, when post_process is set, the head."""
+        features = self.backbone(input)
+        if not self.post_process:
+            return features
+        return self.head(features)
--- a/3rdparty/Megatron-LM/megatron/model/vision/vit_backbone.py
+++ b/3rdparty/Megatron-LM/megatron/model/vision/vit_backbone.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Vision Transformer(VIT) model."""
+import math
+import einops
+import torch
+import apex
+import torch.nn.functional as F
+from megatron import get_args
+from megatron.model import LayerNorm
+from megatron.model.transformer import ParallelTransformer
+from megatron.model.utils import (
+    get_linear_layer,
+    init_method_normal,
+    scaled_init_method_normal,
+)
+from megatron.model.module import MegatronModule
+# Number of sequence positions reserved for the class token: the cls_token
+# parameter and the position-embedding table both use this many extra rows.
+CLASS_TOKEN_LENGTH = 8
+class VitMlpHead(MegatronModule):
+    """Two-layer MLP classification head.
+    Applies a hidden_size -> hidden_size linear layer, a tanh, and then a
+    hidden_size -> num_classes linear layer.
+    Arguments:
+        hidden_size: width of the incoming hidden states.
+        num_classes: width of the produced output.
+    """
+    def __init__(self, hidden_size, num_classes):
+        super(VitMlpHead, self).__init__()
+        self.dense_in = torch.nn.Linear(hidden_size, hidden_size)
+        # NOTE(review): this ReLU is registered but forward() applies tanh.
+        self.relu = torch.nn.ReLU()
+        self.dense_out = torch.nn.Linear(hidden_size, num_classes)
+        # Every output bias starts at -10.
+        torch.nn.init.constant_(self.dense_out.bias, -10)
+    def forward(self, hidden_states):
+        """Return dense_out(tanh(dense_in(hidden_states)))."""
+        # hidden_states is documented upstream as [b, 1, h] -- TODO confirm.
+        return self.dense_out(torch.tanh(self.dense_in(hidden_states)))
+def isPerfectSquare(x):
+    """Return True if x is a non-negative perfect square, otherwise False.
+    Integers are checked with exact integer arithmetic (math.isqrt), so the
+    answer stays correct for values too large to be represented exactly as
+    a float. Non-integer inputs keep the float-based check.
+    """
+    # Written as `not (x >= 0)` so that NaN is rejected here, as before.
+    if not (x >= 0):
+        return False
+    if isinstance(x, int):
+        root = math.isqrt(x)
+    else:
+        root = int(math.sqrt(x))
+    return root * root == x
+def twod_interpolate_position_embeddings_hook(
+    state_dict,
+    prefix,
+    local_metadata,
+    strict,
+    missing_keys,
+    unexpected_keys,
+    error_msgs,
+):
+    """Resize a saved position-embedding table to the current patch grid.
+    Rewrites state_dict[prefix + "weight"] in place: the patch-grid rows are
+    bilinearly interpolated from the saved (assumed square) grid to
+    (img_h // patch_dim, img_w // patch_dim), and the CLASS_TOKEN_LENGTH
+    class-token rows are kept, synthesized as zeros, or dropped depending on
+    args.class_token_present. Only state_dict and prefix are used; the other
+    parameters are accepted but ignored.
+    """
+    args = get_args()
+    num_patches_per_dim_h = args.img_h // args.patch_dim
+    num_patches_per_dim_w = args.img_w // args.patch_dim
+    num_patches = num_patches_per_dim_h * num_patches_per_dim_w
+    hidden_size = args.hidden_size
+    key = prefix + "weight"
+    assert key in state_dict
+    # NOTE(review): after the assert above this condition is always true.
+    if key in state_dict:
+        input_param = state_dict[key]
+        input_seq_len = input_param.shape[0]
+        # The saved table must be a square grid, optionally preceded by
+        # CLASS_TOKEN_LENGTH class-token rows.
+        assert(isPerfectSquare(input_seq_len) or isPerfectSquare(input_seq_len - CLASS_TOKEN_LENGTH))
+        # Heuristic: a non-square row count means class-token rows are present.
+        input_has_class_token = not isPerfectSquare(input_seq_len)
+        num_tok_input = input_seq_len - CLASS_TOKEN_LENGTH if input_has_class_token else input_seq_len
+        num_tok_output = num_patches
+        output_has_class_token = args.class_token_present
+        # update input_param and load it to state_dict[key]
+        if input_has_class_token:
+            input_param_tok = input_param[:CLASS_TOKEN_LENGTH, :]
+            input_param_grid = input_param[CLASS_TOKEN_LENGTH:, :]
+        else:
+            # NOTE(review): created with torch's default dtype and device,
+            # which may differ from input_param's.
+            input_param_tok = torch.zeros(CLASS_TOKEN_LENGTH, hidden_size)
+            input_param_grid = input_param
+        assert input_param.shape[1] == hidden_size
+        if num_tok_input != num_tok_output:
+            gs_input = int(math.sqrt(num_tok_input))
+            gs_new = (num_patches_per_dim_h, num_patches_per_dim_w)
+            # [tokens, hidden] -> [1, hidden, gs_input, gs_input] so that
+            # F.interpolate treats the hidden dimension as channels.
+            input_param_grid = input_param_grid.transpose(0, 1).contiguous()
+            input_param_grid = input_param_grid.reshape(
+                (1, -1, gs_input, gs_input)
+            )
+            input_param_grid = input_param_grid.float()
+            scale_factor = (gs_new[0] / gs_input, gs_new[1] / gs_input)
+            input_param_grid = F.interpolate(
+                input_param_grid, scale_factor=scale_factor, mode="bilinear"
+            )
+            # NOTE(review): the result is always cast to half, whatever the
+            # dtype of the saved tensor was.
+            input_param_grid = input_param_grid.half()
+            # Back to [tokens, hidden].
+            input_param_grid = input_param_grid.reshape((-1, num_tok_output))
+            input_param_grid = input_param_grid.transpose(0, 1).contiguous()
+            assert input_param_grid.shape[1] == hidden_size
+        input_param = input_param_grid
+        assert (
+            input_param.shape[0] == num_tok_output
+            and input_param.shape[1] == hidden_size
+        )
+        if output_has_class_token:
+            # Class-token rows go in front of the patch-grid rows.
+            input_param = torch.cat((input_param_tok, input_param), dim=0)
+        state_dict[key] = input_param
+class VitBackbone(MegatronModule):
+    """Vision Transformer backbone: patch embedding plus ParallelTransformer.
+    Arguments:
+        pre_process: when True, build and apply the patch/position embedding
+            (and class token) before the transformer.
+        post_process: forwarded to ParallelTransformer.
+        class_token: when True, prepend CLASS_TOKEN_LENGTH learned class-token
+            positions to the patch sequence.
+        single_token_output: when True, forward() returns only index 0 along
+            dimension 1 of the transformer output.
+    Image size, patch size, hidden size and initialization settings are read
+    from get_args().
+    """
+    def __init__(self,
+                 pre_process=True,
+                 post_process=True,
+                 class_token=True,
+                 single_token_output=False):
+        super(VitBackbone, self).__init__(share_word_embeddings=False)
+        args = get_args()
+        self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
+        if args.init_method_xavier_uniform:
+            self.init_method = torch.nn.init.xavier_uniform_
+            self.scaled_init_method = torch.nn.init.xavier_uniform_
+        else:
+            self.init_method = init_method_normal(args.init_method_std)
+            self.scaled_init_method = scaled_init_method_normal(
+                args.init_method_std, args.num_layers
+            )
+        self.pre_process = pre_process
+        self.post_process = post_process
+        self.class_token = class_token
+        self.hidden_size = args.hidden_size
+        self.patch_dim = args.patch_dim
+        self.img_h = args.img_h
+        self.img_w = args.img_w
+        self.micro_batch_size = args.micro_batch_size
+        self.single_token_output = single_token_output
+        # The image must tile exactly into patch_dim x patch_dim patches.
+        assert self.img_h % self.patch_dim == 0
+        assert self.img_w % self.patch_dim == 0
+        self.num_patches_per_dim_h = self.img_h // self.patch_dim
+        self.num_patches_per_dim_w = self.img_w // self.patch_dim
+        self.num_patches = self.num_patches_per_dim_h * self.num_patches_per_dim_w
+        self.seq_length = self.num_patches + (CLASS_TOKEN_LENGTH if self.class_token else 0)
+        # Length of one flattened patch: patch_dim * patch_dim * channels.
+        self.flatten_dim = self.patch_dim * self.patch_dim * args.num_channels
+        self.input_tensor = None
+        self.position_ids = None
+        if self.pre_process:
+            # cls_token
+            if self.class_token:
+                # Allocated with randn, then overwritten with zeros.
+                self.cls_token = torch.nn.Parameter(
+                    torch.randn(1, CLASS_TOKEN_LENGTH, self.hidden_size)
+                )
+                torch.nn.init.zeros_(self.cls_token)
+            # Plain tensor attribute placed on the current CUDA device.
+            self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda()
+            # Linear encoder
+            self.linear_encoder = torch.nn.Linear(
+                self.flatten_dim, self.hidden_size
+            )
+            # embedding
+            self.position_embeddings = torch.nn.Embedding(
+                self.seq_length, self.hidden_size
+            )
+            init_method_normal(args.init_method_std)(
+                self.position_embeddings.weight
+            )
+            # Stored on the shared args object because the load-state-dict
+            # pre-hook registered below reads args.class_token_present.
+            args.class_token_present = self.class_token
+            self.position_embeddings._register_load_state_dict_pre_hook(
+                twod_interpolate_position_embeddings_hook
+            )
+            self.embedding_dropout = torch.nn.Dropout(args.hidden_dropout)
+        # Transformer
+        self.transformer = ParallelTransformer(
+            self.init_method,
+            self.scaled_init_method,
+            pre_process=self.pre_process,
+            post_process=self.post_process,
+        )
+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        self.transformer.set_input_tensor(input_tensor)
+    def forward(self, input):
+        """Embed the input (when pre_process is set) and run the transformer.
+        With pre_process set, input is rearranged from "b c (h p1) (w p2)"
+        into flattened patches and must be of dtype torch.half; otherwise it
+        is passed to the transformer as is.
+        """
+        if self.pre_process:
+            # Cut the image into patches and flatten each one.
+            rearranged_input = einops.rearrange(
+                input,
+                "b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
+                p1=self.patch_dim,
+                p2=self.patch_dim,
+            )
+            assert rearranged_input.dtype == torch.half
+            encoder_output = self.linear_encoder(rearranged_input)
+            concatenated_tokens = encoder_output
+            if self.class_token:
+                # Repeat the class token for every sample and put it in front.
+                cls_tokens = self.cls_token.expand(encoder_output.shape[0], -1, -1)
+                concatenated_tokens = torch.cat((cls_tokens, encoder_output), dim=1)
+            token_embeddings = concatenated_tokens + \
+                    self.position_embeddings(self.position_ids[:, :concatenated_tokens.shape[1]])
+            hidden_states = self.embedding_dropout(token_embeddings)
+        else:
+            hidden_states = input
+        # No attention mask is passed to the transformer.
+        hidden_states = self.transformer(hidden_states, None)
+        if self.single_token_output:
+            # Keep only index 0 along dimension 1.
+            hidden_states = hidden_states[:,0,:]
+        return hidden_states
--- a/3rdparty/Megatron-LM/megatron/mpu/__init__.py
+++ b/3rdparty/Megatron-LM/megatron/mpu/__init__.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model parallel utility interface."""
+from .cross_entropy import vocab_parallel_cross_entropy
+from .data import broadcast_data
+from .initialize import is_unitialized
+from .initialize import destroy_model_parallel
+from .initialize import get_data_parallel_group
+from .initialize import get_data_parallel_rank
+from .initialize import get_data_parallel_world_size
+from .initialize import get_embedding_group
+from .initialize import get_position_embedding_group
+from .initialize import get_model_parallel_group
+from .initialize import get_tensor_model_parallel_group
+from .initialize import get_pipeline_model_parallel_group
+from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank
+from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank
+from .initialize import is_pipeline_first_stage, is_pipeline_last_stage
+from .initialize import is_rank_in_embedding_group
+from .initialize import is_rank_in_position_embedding_group
+from .initialize import is_pipeline_stage_before_split, is_pipeline_stage_after_split
+from .initialize import is_pipeline_stage_at_split
+from .initialize import get_num_layers
+from .initialize import get_tensor_model_parallel_src_rank
+from .initialize import get_data_parallel_src_rank
+from .initialize import get_pipeline_model_parallel_first_rank
+from .initialize import get_pipeline_model_parallel_last_rank
+from .initialize import get_pipeline_model_parallel_next_rank
+from .initialize import get_pipeline_model_parallel_prev_rank
+from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size
+from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size
+from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank
+from .initialize import initialize_model_parallel
+from .initialize import model_parallel_is_initialized
+from .layers import ColumnParallelLinear
+from .layers import RowParallelLinear
+from .layers import VocabParallelEmbedding
+from .layers import (set_tensor_model_parallel_attributes,
+                     set_defaults_if_not_set_tensor_model_parallel_attributes,
+                     copy_tensor_model_parallel_attributes)
+from .mappings import copy_to_tensor_model_parallel_region
+from .mappings import gather_from_tensor_model_parallel_region
+from .mappings import reduce_from_tensor_model_parallel_region
+from .mappings import scatter_to_tensor_model_parallel_region
+from .random import checkpoint
+from .random import get_cuda_rng_tracker
+from .random import model_parallel_cuda_manual_seed
+from .random import gather_split_1d_tensor
+from .random import split_tensor_into_1d_equal_chunks
+from .random import make_viewless_tensor
+from .random import assert_viewless_tensor
+from .random import safely_set_viewless_tensor_data
+from .utils import divide
+from .utils import split_tensor_along_last_dim
--- a/3rdparty/Megatron-LM/megatron/mpu/cross_entropy.py
+++ b/3rdparty/Megatron-LM/megatron/mpu/cross_entropy.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from .initialize import get_tensor_model_parallel_group
+from .initialize import get_tensor_model_parallel_rank
+from .initialize import get_tensor_model_parallel_world_size
+from .utils import VocabUtility
+class _VocabParallelCrossEntropy(torch.autograd.Function):
+    """Cross entropy over logits whose vocab dimension is split across the
+    tensor-model-parallel group.
+    NOTE: forward() works in place on vocab_parallel_logits (sub_, exp with
+    out=, div_); after the call that tensor holds the softmax, which is also
+    what is saved for backward().
+    """
+    @staticmethod
+    def forward(ctx, vocab_parallel_logits, target):
+        """Return the per-position loss log(sum(exp(logits))) - logits[target]."""
+        # Maximum value along vocab dimension across all GPUs.
+        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
+        torch.distributed.all_reduce(logits_max,
+                                     op=torch.distributed.ReduceOp.MAX,
+                                     group=get_tensor_model_parallel_group())
+        # Subtract the maximum value.
+        vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
+        # Get the partition's vocab indices
+        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
+        partition_vocab_size = vocab_parallel_logits.size()[-1]
+        rank = get_tensor_model_parallel_rank()
+        world_size = get_tensor_model_parallel_world_size()
+        vocab_start_index, vocab_end_index = get_vocab_range(
+            partition_vocab_size, rank, world_size)
+        # Create a mask of valid vocab ids (1 means it needs to be masked).
+        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
+        masked_target = target.clone() - vocab_start_index
+        # Out-of-partition targets are pointed at index 0 so the gather below
+        # stays in range; their contribution is zeroed afterwards.
+        masked_target[target_mask] = 0
+        # Get predicted-logits = logits[target].
+        # For Simplicity, we convert logits to a 2-D tensor with size
+        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
+        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
+        masked_target_1d = masked_target.view(-1)
+        arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
+                                 device=logits_2d.device)
+        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
+        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
+        predicted_logits = predicted_logits_1d.view_as(target)
+        predicted_logits[target_mask] = 0.0
+        # All reduce is needed to get the chunks from other GPUs.
+        torch.distributed.all_reduce(predicted_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=get_tensor_model_parallel_group())
+        # Sum of exponential of logits along vocab dimension across all GPUs.
+        # exp_logits aliases vocab_parallel_logits, so exp is computed in place.
+        exp_logits = vocab_parallel_logits
+        torch.exp(vocab_parallel_logits, out=exp_logits)
+        sum_exp_logits = exp_logits.sum(dim=-1)
+        torch.distributed.all_reduce(sum_exp_logits,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=get_tensor_model_parallel_group())
+        # Loss = log(sum(exp(logits))) - predicted-logit.
+        loss = torch.log(sum_exp_logits) - predicted_logits
+        # Store softmax, target-mask and masked-target for backward pass.
+        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
+        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
+        return loss
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return (gradient w.r.t. the logits, None for target)."""
+        # Retrieve tensors from the forward path.
+        softmax, target_mask, masked_target_1d = ctx.saved_tensors
+        # All the inputs have softmax as their gradient.
+        # grad_input aliases the saved softmax, which is modified in place below.
+        grad_input = softmax
+        # For simplicity, work with the 2D gradient.
+        partition_vocab_size = softmax.size()[-1]
+        grad_2d = grad_input.view(-1, partition_vocab_size)
+        # Add the gradient from matching classes.
+        arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
+                                 device=grad_2d.device)
+        # Subtract 1 at the target index, but only for targets that fall in
+        # this partition (target_mask is False there).
+        grad_2d[arange_1d, masked_target_1d] -= (
+            1.0 - target_mask.view(-1).float())
+        # Finally elementwise multiplication with the output gradients.
+        grad_input.mul_(grad_output.unsqueeze(dim=-1))
+        return grad_input, None
+def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
+    """Helper function for the cross entropy.
+    Applies _VocabParallelCrossEntropy to the given logits and target and
+    returns its result.
+    NOTE: the underlying forward pass modifies vocab_parallel_logits in place.
+    """
+    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
--- a/3rdparty/Megatron-LM/megatron/mpu/data.py
+++ b/3rdparty/Megatron-LM/megatron/mpu/data.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from .initialize import get_tensor_model_parallel_group
+from .initialize import get_tensor_model_parallel_rank
+from .initialize import get_tensor_model_parallel_src_rank
+# Number of size slots reserved per key when tensor shapes are broadcast;
+# every tensor must have fewer dimensions than this.
+_MAX_DATA_DIM = 5
+def _check_data_types(keys, data, target_dtype):
+    """Assert that every tensor in data named by keys has dtype target_dtype."""
+    for name in keys:
+        found_dtype = data[name].dtype
+        assert found_dtype == target_dtype, \
+            '{} has data type {} which is different than {}'.format(
+                name, found_dtype, target_dtype)
+def _build_key_size_numel_dictionaries(keys, data):
+    """Build the size on rank 0 and broadcast.
+    Tensor-model-parallel rank 0 packs the shape of every data[key] into a
+    flat list with _MAX_DATA_DIM slots per key; the list is broadcast over
+    the tensor-model-parallel group and unpacked on every rank.
+    Returns:
+        (key_size, key_numel, total_numel): per-key shape list, per-key
+        element count, and the element count summed over all keys.
+    """
+    max_dim = _MAX_DATA_DIM
+    # One zero-filled block of max_dim slots per key.
+    sizes = [0 for _ in range(max_dim) for _ in keys]
+    # Pack the sizes on rank zero.
+    if get_tensor_model_parallel_rank() == 0:
+        offset = 0
+        for key in keys:
+            # Strictly fewer than max_dim dims, so each block keeps at least
+            # one trailing 0 that ends the unpack loop below.
+            assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
+            size = data[key].size()
+            for i, s in enumerate(size):
+                sizes[i + offset] = s
+            offset += max_dim
+    # Move to GPU and broadcast.
+    sizes_cuda = torch.cuda.LongTensor(sizes)
+    torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(),
+                                group=get_tensor_model_parallel_group())
+    # Move back to cpu and unpack.
+    sizes_cpu = sizes_cuda.cpu()
+    key_size = {}
+    key_numel = {}
+    total_numel = 0
+    offset = 0
+    for key in keys:
+        i = 0
+        size = []
+        numel = 1
+        # A 0 entry ends this key's shape.
+        # NOTE(review): a genuine zero-length dimension would therefore be
+        # read as the end of the shape.
+        while sizes_cpu[offset + i] > 0:
+            # NOTE(review): this_size is an element indexed out of sizes_cpu
+            # and is not converted to a Python int here.
+            this_size = sizes_cpu[offset + i]
+            size.append(this_size)
+            numel *= this_size
+            i += 1
+        key_size[key] = size
+        key_numel[key] = numel
+        total_numel += numel
+        offset += max_dim
+    return key_size, key_numel, total_numel
+def broadcast_data(keys, data, datatype):
+    """Broadcast data from rank zero of each model parallel group to the
+    members of the same model parallel group.
+    Arguments:
+        keys: list of keys in the data dictionary to be broadcasted
+        data: data dictionary of string keys and cpu tensor values.
+        datatype: torch data type of all tensors in data associated
+                  with keys.
+    Returns:
+        Dictionary mapping each key to a view, with the original shape, into
+        one flat CUDA tensor that holds the data of all keys.
+    """
+    # Build (key, size) and (key, number of elements) dictionaries along
+    # with the total number of elements on all ranks.
+    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
+                                                                          data)
+    # Pack on rank zero.
+    if get_tensor_model_parallel_rank() == 0:
+        # Check that all keys have the same data type.
+        _check_data_types(keys, data, datatype)
+        # Flatten the data associated with the keys
+        flatten_data = torch.cat(
+            [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
+    else:
+        # Other ranks only allocate a receive buffer of the right size/dtype.
+        flatten_data = torch.empty(total_numel,
+                                   device=torch.cuda.current_device(),
+                                   dtype=datatype)
+    # Broadcast
+    torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(),
+                                group=get_tensor_model_parallel_group())
+    # Unpack
+    output = {}
+    offset = 0
+    for key in keys:
+        size = key_size[key]
+        numel = key_numel[key]
+        # narrow() + view() slice the flat buffer without copying.
+        output[key] = flatten_data.narrow(0, offset, numel).view(size)
+        offset += numel
+    return output