Commit e4575be9 authored by huaerkl

v1.0
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Learning rate decay functions."""
import math
from megatron import print_rank_0, get_args
class AnnealingLR(object):
"""Anneals the learning rate."""
def __init__(self, optimizer, max_lr, min_lr,
warmup_steps, decay_steps, decay_style,
use_checkpoint_lr_scheduler=True,
override_lr_scheduler=False):
args = get_args()
# Class values.
self.optimizer = optimizer
self.max_lr = float(max_lr)
self.min_lr = min_lr
assert self.min_lr >= 0.0
assert self.max_lr >= self.min_lr
self.warmup_steps = warmup_steps
self.num_steps = 0
self.decay_steps = decay_steps
assert self.decay_steps > 0
assert self.warmup_steps < self.decay_steps
self.decay_tokens = args.lr_decay_tokens
self.num_tokens = 0
self.warmup_tokens = 0
self.decay_style = decay_style
self.override_lr_scheduler = override_lr_scheduler
self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
if self.override_lr_scheduler:
assert not self.use_checkpoint_lr_scheduler, 'both override and '\
'use-checkpoint are set.'
# Set the learning rate
self.step(0)
print_rank_0('> learning rate decay style: {}'.format(self.decay_style))
def get_lr(self):
"""Learning rate decay functions from:
https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
# Use linear warmup for the initial part.
if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps:
if self.num_steps == self.warmup_steps and \
self.decay_tokens is not None:
self.warmup_tokens = self.num_tokens
return self.max_lr * float(self.num_steps) / \
float(self.warmup_steps)
# If the learning rate is constant, just return the initial value.
if self.decay_style == 'constant':
return self.max_lr
if self.decay_tokens is None:
# step-based decay
# For any steps larger than `self.decay_steps`, use `self.min_lr`.
if self.num_steps > self.decay_steps:
return self.min_lr
# If we are done with the warmup period, use the decay style.
num_steps_ = self.num_steps - self.warmup_steps
decay_steps_ = self.decay_steps - self.warmup_steps
decay_ratio = float(num_steps_) / float(decay_steps_)
else:
# token-based decay
if self.num_tokens > self.decay_tokens:
return self.min_lr
num_tokens_ = self.num_tokens - self.warmup_tokens
decay_tokens_ = self.decay_tokens - self.warmup_tokens
decay_ratio = float(num_tokens_) / float(decay_tokens_)
assert decay_ratio >= 0.0
assert decay_ratio <= 1.0
delta_lr = self.max_lr - self.min_lr
if self.decay_style == 'linear':
coeff = (1.0 - decay_ratio)
elif self.decay_style == 'cosine':
coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
else:
raise Exception('{} decay style is not supported.'.format(
self.decay_style))
return self.min_lr + coeff * delta_lr
def step(self, increment, token_num=None):
"""Set lr for all parameters groups."""
if token_num is None:
args = get_args()
token_num = args.consumed_train_tokens
self.num_tokens = token_num
self.num_steps += increment
new_lr = self.get_lr()
for group in self.optimizer.param_groups:
group['lr'] = new_lr
def state_dict(self):
state_dict = {
'max_lr': self.max_lr,
'warmup_steps': self.warmup_steps,
'num_steps': self.num_steps,
'warmup_tokens': self.warmup_tokens,
'num_tokens': self.num_tokens,
'decay_style': self.decay_style,
'decay_steps': self.decay_steps,
'min_lr': self.min_lr
}
return state_dict
def _check_and_set(self, cls_value, sd_value, name):
"""Auxiliary function for checking the values in the checkpoint and
setting them."""
if self.override_lr_scheduler:
print_rank_0(' > overriding {} value to {}'.format(name, cls_value))
return cls_value
if not self.use_checkpoint_lr_scheduler:
assert cls_value == sd_value, \
f'AnnealingLR: class input value {cls_value} and checkpoint ' \
f'value {sd_value} for {name} do not match'
print_rank_0(' > using checkpoint value {} for {}'.format(sd_value,
name))
return sd_value
def load_state_dict(self, sd):
if 'start_lr' in sd:
max_lr_ = sd['start_lr']
else:
max_lr_ = sd['max_lr']
self.max_lr = self._check_and_set(self.max_lr, max_lr_,
'learning rate')
self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'],
'minimum learning rate')
if 'warmup_iter' in sd:
warmup_steps_ = sd['warmup_iter']
else:
warmup_steps_ = sd['warmup_steps']
self.warmup_steps = self._check_and_set(self.warmup_steps,
warmup_steps_,
'warmup iterations')
if 'end_iter' in sd:
decay_steps_ = sd['end_iter']
else:
decay_steps_ = sd['decay_steps']
self.decay_steps = self._check_and_set(self.decay_steps, decay_steps_,
'total number of iterations')
self.decay_style = self._check_and_set(self.decay_style,
sd['decay_style'],
'decay style')
if 'num_iters' in sd:
num_steps = sd['num_iters']
else:
num_steps = sd['num_steps']
if 'warmup_tokens' in sd:
self.warmup_tokens = sd['warmup_tokens']
if 'num_tokens' in sd:
self.num_tokens = sd['num_tokens']
self.step(num_steps, self.num_tokens)
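# --- Illustrative sketch (not part of the original file) ---
# A minimal, standalone reproduction of the warmup + cosine schedule that
# `AnnealingLR.get_lr` computes when decay is step-based (decay_tokens is None).
# All numbers below are made up purely for illustration.
if __name__ == "__main__":
    max_lr, min_lr = 1.0e-4, 1.0e-5
    warmup_steps, decay_steps = 100, 1000

    def lr_at(step):
        # Linear warmup for the first `warmup_steps` steps.
        if warmup_steps > 0 and step <= warmup_steps:
            return max_lr * float(step) / float(warmup_steps)
        # Past `decay_steps`, the learning rate is clamped to `min_lr`.
        if step > decay_steps:
            return min_lr
        decay_ratio = float(step - warmup_steps) / float(decay_steps - warmup_steps)
        coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
        return min_lr + coeff * (max_lr - min_lr)

    for s in (0, 50, 100, 500, 1000, 1500):
        print(s, lr_at(s))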
# coding=utf-8
# Copyright 2020 Optuna, Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Logging utilities. """
import logging
import os
import sys
import threading
from functools import wraps
from logging import CRITICAL # NOQA
from logging import DEBUG # NOQA
from logging import ERROR # NOQA
from logging import FATAL # NOQA
from logging import INFO # NOQA
from logging import NOTSET # NOQA
from logging import WARN # NOQA
from logging import WARNING # NOQA
from typing import Optional
_lock = threading.Lock()
_default_handler: Optional[logging.Handler] = None
log_levels = {
"debug": logging.DEBUG,
"info": logging.INFO,
"warning": logging.WARNING,
"error": logging.ERROR,
"critical": logging.CRITICAL,
}
_default_log_level = logging.WARNING
def _get_default_logging_level():
"""
If the MEGATRON_DEEPSPEED_VERBOSITY env var is set to one of the valid choices, return that as the new default level. If it is
not, fall back to ``_default_log_level``.
"""
env_level_str = os.getenv("MEGATRON_DEEPSPEED_VERBOSITY", None)
if env_level_str:
if env_level_str in log_levels:
return log_levels[env_level_str]
else:
logging.getLogger().warning(
f"Unknown option MEGATRON_DEEPSPEED_VERBOSITY={env_level_str}, "
f"has to be one of: { ', '.join(log_levels.keys()) }"
)
return _default_log_level
def _get_library_name() -> str:
return __name__.split(".")[0]
def _get_library_root_logger() -> logging.Logger:
return logging.getLogger(_get_library_name())
def _configure_library_root_logger() -> None:
global _default_handler
with _lock:
if _default_handler:
# This library has already configured the library root logger.
return
_default_handler = logging.StreamHandler() # Set sys.stderr as stream.
_default_handler.flush = sys.stderr.flush
# Apply our default configuration to the library root logger.
library_root_logger = _get_library_root_logger()
library_root_logger.addHandler(_default_handler)
library_root_logger.setLevel(_get_default_logging_level())
library_root_logger.propagate = False
def _reset_library_root_logger() -> None:
global _default_handler
with _lock:
if not _default_handler:
return
library_root_logger = _get_library_root_logger()
library_root_logger.removeHandler(_default_handler)
library_root_logger.setLevel(logging.NOTSET)
_default_handler = None
def get_log_levels_dict():
return log_levels
def get_logger(name: Optional[str] = None) -> logging.Logger:
"""
Return a logger with the specified name.
This function is not supposed to be directly accessed unless you are writing a custom transformers module.
"""
if name is None:
name = _get_library_name()
_configure_library_root_logger()
return logging.getLogger(name)
def get_verbosity() -> int:
"""
Return the current level for the 🤗 Transformers's root logger as an int.
Returns:
:obj:`int`: The logging level.
.. note::
🤗 Transformers has the following logging levels:
- 50: ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL``
- 40: ``transformers.logging.ERROR``
- 30: ``transformers.logging.WARNING`` or ``transformers.logging.WARN``
- 20: ``transformers.logging.INFO``
- 10: ``transformers.logging.DEBUG``
"""
_configure_library_root_logger()
return _get_library_root_logger().getEffectiveLevel()
def set_verbosity(verbosity: int) -> None:
"""
Set the verbosity level for the 🤗 Transformers's root logger.
Args:
verbosity (:obj:`int`):
Logging level, e.g., one of:
- ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL``
- ``transformers.logging.ERROR``
- ``transformers.logging.WARNING`` or ``transformers.logging.WARN``
- ``transformers.logging.INFO``
- ``transformers.logging.DEBUG``
"""
_configure_library_root_logger()
_get_library_root_logger().setLevel(verbosity)
def set_verbosity_info():
"""Set the verbosity to the :obj:`INFO` level."""
return set_verbosity(INFO)
def set_verbosity_warning():
"""Set the verbosity to the :obj:`WARNING` level."""
return set_verbosity(WARNING)
def set_verbosity_debug():
"""Set the verbosity to the :obj:`DEBUG` level."""
return set_verbosity(DEBUG)
def set_verbosity_error():
"""Set the verbosity to the :obj:`ERROR` level."""
return set_verbosity(ERROR)
def disable_default_handler() -> None:
"""Disable the default handler of the HuggingFace Transformers's root logger."""
_configure_library_root_logger()
assert _default_handler is not None
_get_library_root_logger().removeHandler(_default_handler)
def enable_default_handler() -> None:
"""Enable the default handler of the HuggingFace Transformers's root logger."""
_configure_library_root_logger()
assert _default_handler is not None
_get_library_root_logger().addHandler(_default_handler)
def add_handler(handler: logging.Handler) -> None:
"""adds a handler to the HuggingFace Transformers's root logger."""
_configure_library_root_logger()
assert handler is not None
_get_library_root_logger().addHandler(handler)
def remove_handler(handler: logging.Handler) -> None:
"""removes given handler from the HuggingFace Transformers's root logger."""
_configure_library_root_logger()
assert handler is not None and handler not in _get_library_root_logger().handlers
_get_library_root_logger().removeHandler(handler)
def disable_propagation() -> None:
"""
Disable propagation of the library log outputs. Note that log propagation is disabled by default.
"""
_configure_library_root_logger()
_get_library_root_logger().propagate = False
def enable_propagation() -> None:
"""
Enable propagation of the library log outputs. Please disable the HuggingFace Transformers's default handler to
prevent double logging if the root logger has been configured.
"""
_configure_library_root_logger()
_get_library_root_logger().propagate = True
def enable_explicit_format() -> None:
"""
Enable explicit formatting for every HuggingFace Transformers's logger. The explicit formatter is as follows:
::
[LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
All handlers currently bound to the root logger are affected by this method.
"""
handlers = _get_library_root_logger().handlers
for handler in handlers:
formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
handler.setFormatter(formatter)
def reset_format() -> None:
"""
Resets the formatting for HuggingFace Transformers's loggers.
All handlers currently bound to the root logger are affected by this method.
"""
handlers = _get_library_root_logger().handlers
for handler in handlers:
handler.setFormatter(None)
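# --- Illustrative usage sketch (not part of the original file) ---
# Minimal example of the helpers above, assuming this module is run as-is.
# Exporting MEGATRON_DEEPSPEED_VERBOSITY=info before the first import has the
# same effect as the explicit set_verbosity(INFO) call here.
if __name__ == "__main__":
    set_verbosity(INFO)
    enable_explicit_format()
    logger = get_logger()
    logger.info("library logger configured at level %s", get_verbosity())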
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
# A dictionary of all the memory buffers allocated.
_MEM_BUFFS = dict()
def allocate_mem_buff(name, numel, dtype, track_usage):
"""Allocate a memory buffer."""
assert name not in _MEM_BUFFS, \
'memory buffer {} already allocated.'.format(name)
_MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage)
return _MEM_BUFFS[name]
def get_mem_buff(name):
"""Get the memory buffer."""
return _MEM_BUFFS[name]
class MemoryBuffer:
"""Contiguous memory buffer.
Allocate a contiguous block of memory of type `dtype` and size `numel`. It is
used to reduce memory fragmentation.
Usage: After the allocation, the `_start` index is set to the first
index of the memory. A memory chunk starting from the `_start` index
can be `allocated` for an input tensor, with the elements of the
tensor being copied. The buffer can be reused by resetting the
`_start` index.
"""
def __init__(self, name, numel, dtype, track_usage):
if torch.distributed.get_rank() == 0:
element_size = torch.tensor([], dtype=dtype).element_size()
print('> building the {} memory buffer with {} num elements '
'and {} dtype ({:.1f} MB)...'.format(
name, numel, dtype, numel*element_size/1024/1024),
flush=True)
self.name = name
self.numel = numel
self.dtype = dtype
self.data = torch.empty(self.numel,
dtype=self.dtype,
device=torch.cuda.current_device(),
requires_grad=False)
# Index tracking the start of the free memory.
self._start = 0
# Values used for tracking usage.
self.track_usage = track_usage
if self.track_usage:
self.in_use_value = 0.0
self.total_value = 0.0
def reset(self):
"""Reset the buffer start index to the beginning of the buffer."""
self._start = 0
def is_in_use(self):
"""Whether the current buffer hold on to any memory."""
return self._start > 0
def numel_in_use(self):
"""Return number of elements in use."""
return self._start
def add(self, tensor):
"""Allocate a chunk of memory from the buffer to tensor and copy
the values."""
assert tensor.dtype == self.dtype, \
'Input tensor type {} different from buffer type {}'.format(
tensor.dtype, self.dtype)
# Number of elements of the input tensor.
tensor_numel = torch.numel(tensor)
new_start = self._start + tensor_numel
assert new_start <= self.numel, \
'Not enough memory left in the buffer ({} > {})'.format(
tensor_numel, self.numel - self._start)
# New tensor is a view into the memory.
new_tensor = self.data[self._start:new_start]
self._start = new_start
new_tensor = new_tensor.view(tensor.shape)
new_tensor.copy_(tensor)
# Return a pointer to the new tensor.
return new_tensor
def get_data(self):
"""Return the data currently in use."""
if self.track_usage:
self.in_use_value += float(self._start)
self.total_value += float(self.numel)
return self.data[:self._start]
def print_average_usage(self):
"""Print memory usage average over time. We would like this value
to be as high as possible."""
assert self.track_usage, 'You need to enable track usage.'
if torch.distributed.get_rank() == 0:
print(' > usage of {} memory buffer: {:.2f} %'.format(
self.name, self.in_use_value * 100.0 / self.total_value),
flush=True)
class RingMemBuffer:
"""A ring of memory buffers."""
def __init__(self, name, num_buffers, numel, dtype, track_usage):
self.num_buffers = num_buffers
self.buffers = [
allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage)
for i in range(num_buffers)]
self._index = -1
def get_next_buffer(self):
self._index += 1
self._index = self._index % self.num_buffers
buff = self.buffers[self._index]
assert not buff.is_in_use(), 'buffer is already in use.'
return buff
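# --- Illustrative sketch (not part of the original file) ---
# The classes above need CUDA and, for the rank-0 print, an initialized
# torch.distributed process group. This standalone CPU sketch only mimics the
# allocation pattern of `MemoryBuffer.add`: carve views out of one contiguous
# tensor and copy the inputs into them.
if __name__ == "__main__":
    data = torch.empty(16, dtype=torch.float32)  # contiguous backing storage
    start = 0

    def add(tensor):
        # Reserve `tensor.numel()` elements starting at `start`, copy, advance.
        global start
        end = start + tensor.numel()
        assert end <= data.numel(), 'not enough memory left in the buffer'
        view = data[start:end].view(tensor.shape)
        view.copy_(tensor)
        start = end
        return view

    a = add(torch.ones(2, 3))
    b = add(torch.full((2,), 2.0))
    print(a.shape, b.shape, start)  # torch.Size([2, 3]) torch.Size([2]) 8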
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron number of micro-batches calculators."""
from abc import ABC
from abc import abstractmethod
def build_num_microbatches_calculator(args):
# Constant num micro-batches.
if args.rampup_batch_size is None:
num_microbatches_calculator = ConstantNumMicroBatches(
args.global_batch_size, args.micro_batch_size,
args.data_parallel_size)
if args.rank == 0:
print('setting number of micro-batches to constant {}'.format(
num_microbatches_calculator.get()), flush=True)
else:
assert len(args.rampup_batch_size) == 3, 'expected the following ' \
'format: --rampup-batch-size <start batch size> ' \
'<batch size increment> <ramp-up samples>'
start_batch_size = int(args.rampup_batch_size[0])
batch_size_increment = int(args.rampup_batch_size[1])
ramup_samples = int(args.rampup_batch_size[2])
if args.rank == 0:
print('will use batch size rampup starting from global batch '
'size {} to global batch size {} with batch size increments '
'{} over {} samples.'.format(start_batch_size,
args.global_batch_size,
batch_size_increment,
ramup_samples), flush=True)
num_microbatches_calculator = RampupBatchsizeNumMicroBatches(
start_batch_size, batch_size_increment, ramup_samples,
args.global_batch_size, args.micro_batch_size,
args.data_parallel_size)
return num_microbatches_calculator
class NumMicroBatchesCalculator(ABC):
def __init__(self):
self.num_micro_batches = None
self.current_global_batch_size = None
def get(self):
return self.num_micro_batches
def get_current_global_batch_size(self):
return self.current_global_batch_size
@abstractmethod
def update(self, consumed_samples, consistency_check):
pass
class ConstantNumMicroBatches(NumMicroBatchesCalculator):
def __init__(self, global_batch_size, micro_batch_size, data_parallel_size):
micro_batch_times_data_parallel = micro_batch_size * \
data_parallel_size
assert global_batch_size % micro_batch_times_data_parallel == 0, \
'global batch size ({}) is not divisible by micro batch size ({})' \
' times data parallel size ({})'.format(global_batch_size,
micro_batch_size,
data_parallel_size)
self.num_micro_batches = global_batch_size // \
micro_batch_times_data_parallel
assert self.num_micro_batches >= 1
self.current_global_batch_size = global_batch_size
def update(self, consumed_samples, consistency_check):
pass
class RampupBatchsizeNumMicroBatches(NumMicroBatchesCalculator):
def __init__(self, start_batch_size, batch_size_increment, ramup_samples,
global_batch_size, micro_batch_size, data_parallel_size):
"""Batch size ramp up.
Over
steps = (global-batch-size - start-batch-size) / batch_size_increment
increments, the batch size is ramped from start-batch-size to
global-batch-size, spending rampup-samples / steps samples on each
increment.
Arguments:
start_batch_size: global batch size to start with
batch_size_increment: global batch size increments
ramup_samples: number of samples over which to ramp up the global
batch size from `start_batch_size` to `global_batch_size`
global_batch_size: global batch size post rampup
micro_batch_size: micro batch size
data_parallel_size: data parallel size.
"""
self.micro_batch_size = micro_batch_size
self.data_parallel_size = data_parallel_size
self.micro_batch_times_data_parallel_size = self.micro_batch_size * \
self.data_parallel_size
assert self.micro_batch_times_data_parallel_size > 0
assert start_batch_size > 0
self.start_batch_size = start_batch_size
assert global_batch_size > 0
self.global_batch_size = global_batch_size
diff_batch_size = self.global_batch_size - self.start_batch_size
assert diff_batch_size >= 0
assert batch_size_increment > 0
self.batch_size_increment = batch_size_increment
assert diff_batch_size % batch_size_increment == 0, 'expected ' \
'global batch size interval ({}) to be divisible by global batch ' \
'size increment ({})'.format(diff_batch_size, batch_size_increment)
num_increments = diff_batch_size // self.batch_size_increment
self.ramup_samples = ramup_samples
assert self.ramup_samples >= 0
self.rampup_samples_per_increment = self.ramup_samples / num_increments
# Initialize number of microbatches.
self.update(0, False)
def update(self, consumed_samples, consistency_check):
if consumed_samples > self.ramup_samples:
self.current_global_batch_size = self.global_batch_size
else:
steps = int(consumed_samples / self.rampup_samples_per_increment)
self.current_global_batch_size = self.start_batch_size + \
steps * self.batch_size_increment
assert self.current_global_batch_size <= self.global_batch_size
if consistency_check:
assert self.current_global_batch_size % \
self.micro_batch_times_data_parallel_size == 0, 'current global ' \
'batch size ({}) is not divisible by micro-batch-size ({}) ' \
'times data parallel size ({})'.format(self.current_global_batch_size,
self.micro_batch_size,
self.data_parallel_size)
self.num_micro_batches = self.current_global_batch_size // \
self.micro_batch_times_data_parallel_size
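# --- Illustrative sketch (not part of the original file) ---
# Standalone example of the calculators above; all batch sizes are made up.
# With global_batch_size=32, micro_batch_size=2 and data_parallel_size=4, each
# data-parallel rank runs 32 / (2 * 4) = 4 micro-batches per step.
if __name__ == "__main__":
    constant = ConstantNumMicroBatches(global_batch_size=32,
                                       micro_batch_size=2,
                                       data_parallel_size=4)
    print(constant.get())  # 4

    # Ramp up from 16 to 32 in increments of 8 over 1000 consumed samples.
    rampup = RampupBatchsizeNumMicroBatches(16, 8, 1000, 32, 2, 4)
    for consumed in (0, 400, 800, 1200):
        rampup.update(consumed, consistency_check=True)
        print(consumed, rampup.get_current_global_batch_size(), rampup.get())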
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from .distributed import DistributedDataParallel
from .bert_model import BertModel
from .gpt_model import GPTModel, GPTModelPipe
from .vit_model import VitModel, VitModelPipe
from .t5_model import T5Model
from .language_model import get_language_model
from .module import Float16Module
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT model."""
import torch
from megatron import get_args
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.language_model import parallel_lm_logits
from megatron.model.language_model import get_language_model
from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from megatron.model.utils import openai_gelu, erf_gelu
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from .module import MegatronModule
def bert_extended_attention_mask(attention_mask):
# We create a 3D attention mask from a 2D tensor mask.
# [b, 1, s]
attention_mask_b1s = attention_mask.unsqueeze(1)
# [b, s, 1]
attention_mask_bs1 = attention_mask.unsqueeze(2)
# [b, s, s]
attention_mask_bss = attention_mask_b1s * attention_mask_bs1
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)
# Convert attention mask to binary:
extended_attention_mask = (extended_attention_mask < 0.5)
return extended_attention_mask
def bert_position_ids(token_ids):
# Create position ids
seq_length = token_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long,
device=token_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
return position_ids
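# --- Illustrative sketch (not part of the original file) ---
# Shape check for the two helpers above on a toy batch (values are made up).
if __name__ == "__main__":
    toy_mask = torch.tensor([[1, 1, 0], [1, 0, 0]])  # [b=2, s=3]
    ext_mask = bert_extended_attention_mask(toy_mask)
    print(ext_mask.shape)  # torch.Size([2, 1, 3, 3]); True marks masked positions
    toy_ids = torch.tensor([[5, 6, 0], [7, 0, 0]])
    print(bert_position_ids(toy_ids))  # tensor([[0, 1, 2], [0, 1, 2]])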
class BertLMHead(MegatronModule):
"""Masked LM head for Bert
Arguments:
mpu_vocab_size: vocabulary size on this model-parallel partition.
hidden_size: hidden size
init_method: init method for weight initialization
layernorm_epsilon: tolerance for layer norm divisions
parallel_output: whether output logits being distributed or not.
"""
def __init__(self, mpu_vocab_size, hidden_size, init_method,
layernorm_epsilon, parallel_output):
super(BertLMHead, self).__init__()
args = get_args()
self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
mpu.set_tensor_model_parallel_attributes(self.bias, True, 0, 1)
self.parallel_output = parallel_output
self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
self.gelu = torch.nn.functional.gelu
if args.openai_gelu:
self.gelu = openai_gelu
elif args.onnx_safe:
self.gelu = erf_gelu
def forward(self, hidden_states, word_embeddings_weight):
hidden_states = self.dense(hidden_states)
hidden_states = self.gelu(hidden_states)
hidden_states = self.layernorm(hidden_states)
output = parallel_lm_logits(hidden_states,
word_embeddings_weight,
self.parallel_output,
bias=self.bias)
return output
def post_language_model_processing(lm_output, pooled_output,
lm_head, binary_head,
lm_labels,
logit_weights,
fp16_lm_cross_entropy):
# Output.
lm_logits = lm_head(
lm_output, logit_weights)
binary_logits = None
if binary_head is not None:
binary_logits = binary_head(pooled_output)
if lm_labels is None:
return lm_logits, binary_logits
else:
if fp16_lm_cross_entropy:
assert lm_logits.dtype == torch.half
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
else:
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
lm_labels)
return lm_loss, binary_logits
class BertModel(MegatronModule):
"""Bert Language model."""
def __init__(self,
num_tokentypes=2,
add_binary_head=True,
parallel_output=True,
pre_process=True,
post_process=True):
super(BertModel, self).__init__()
args = get_args()
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
self.add_binary_head = add_binary_head
self.parallel_output = parallel_output
self.pre_process = pre_process
self.post_process = post_process
init_method = init_method_normal(args.init_method_std)
scaled_init_method = scaled_init_method_normal(args.init_method_std,
args.num_layers)
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=self.add_binary_head,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method,
pre_process=self.pre_process,
post_process=self.post_process)
self.initialize_word_embeddings(init_method_normal)
if self.post_process:
self.lm_head = BertLMHead(
self.word_embeddings_weight().size(0),
args.hidden_size, init_method, args.layernorm_epsilon, parallel_output)
self._lm_head_key = 'lm_head'
self.binary_head = None
if self.add_binary_head:
self.binary_head = get_linear_layer(args.hidden_size, 2,
init_method)
self._binary_head_key = 'binary_head'
def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
self.language_model.set_input_tensor(input_tensor)
def forward(self, bert_model_input, attention_mask,
tokentype_ids=None, lm_labels=None):
extended_attention_mask = bert_extended_attention_mask(attention_mask)
input_ids = bert_model_input
position_ids = bert_position_ids(input_ids)
lm_output = self.language_model(
input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids
)
if self.post_process and self.add_binary_head:
lm_output, pooled_output = lm_output
else:
pooled_output = None
if self.post_process:
return post_language_model_processing(lm_output, pooled_output,
self.lm_head, self.binary_head,
lm_labels,
self.word_embeddings_weight(),
self.fp16_lm_cross_entropy)
else:
return lm_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.post_process:
state_dict_[self._lm_head_key] \
= self.lm_head.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.post_process and self.add_binary_head:
state_dict_[self._binary_head_key] \
= self.binary_head.state_dict(destination, prefix, keep_vars)
# Save word_embeddings.
if self.post_process and not self.pre_process:
state_dict_[self._word_embeddings_for_head_key] \
= self.word_embeddings.state_dict(destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
if self.post_process:
self.lm_head.load_state_dict(
state_dict[self._lm_head_key], strict=strict)
if self.post_process and self.add_binary_head:
self.binary_head.load_state_dict(
state_dict[self._binary_head_key], strict=strict)
# Load word_embeddings.
if self.post_process and not self.pre_process:
self.word_embeddings.load_state_dict(
state_dict[self._word_embeddings_for_head_key], strict=strict)
import os
import torch
import sys
from megatron import get_args, print_rank_0
from megatron.checkpointing import fix_query_key_value_ordering
from megatron.checkpointing import get_checkpoint_tracker_filename
from megatron.checkpointing import get_checkpoint_name
from megatron import mpu, get_tokenizer
from megatron.model.bert_model import bert_position_ids
from megatron.enums import AttnMaskType
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from .module import MegatronModule
def biencoder_model_provider(only_query_model=False,
only_context_model=False,
biencoder_shared_query_context_model=False):
"""Build the model."""
args = get_args()
assert mpu.get_tensor_model_parallel_world_size() == 1 and \
mpu.get_pipeline_model_parallel_world_size() == 1, \
"Model parallel size > 1 not supported for ICT"
print_rank_0('building BiEncoderModel...')
# simpler to just keep using 2 tokentypes since
# the LM we initialize with has 2 tokentypes
model = BiEncoderModel(
num_tokentypes=2,
parallel_output=False,
only_query_model=only_query_model,
only_context_model=only_context_model,
biencoder_shared_query_context_model=\
biencoder_shared_query_context_model)
return model
class BiEncoderModel(MegatronModule):
"""Bert-based module for Biencoder model."""
def __init__(self,
num_tokentypes=1,
parallel_output=True,
only_query_model=False,
only_context_model=False,
biencoder_shared_query_context_model=False):
super(BiEncoderModel, self).__init__()
args = get_args()
bert_kwargs = dict(
num_tokentypes=num_tokentypes,
parallel_output=parallel_output)
self.biencoder_shared_query_context_model = \
biencoder_shared_query_context_model
assert not (only_context_model and only_query_model)
self.use_context_model = not only_query_model
self.use_query_model = not only_context_model
self.biencoder_projection_dim = args.biencoder_projection_dim
if self.biencoder_shared_query_context_model:
self.model = PretrainedBertModel(**bert_kwargs)
self._model_key = 'shared_model'
self.query_model, self.context_model = self.model, self.model
else:
if self.use_query_model:
# this model embeds (pseudo-)queries - Embed_input in the paper
self.query_model = PretrainedBertModel(**bert_kwargs)
self._query_key = 'query_model'
if self.use_context_model:
# this model embeds evidence blocks - Embed_doc in the paper
self.context_model = PretrainedBertModel(**bert_kwargs)
self._context_key = 'context_model'
def forward(self, query_tokens, query_attention_mask, query_types,
context_tokens, context_attention_mask, context_types):
"""Run a forward pass for each of the models and
return the respective embeddings."""
if self.use_query_model:
query_logits = self.embed_text(self.query_model,
query_tokens,
query_attention_mask,
query_types)
else:
raise ValueError("Cannot embed query without the query model.")
if self.use_context_model:
context_logits = self.embed_text(self.context_model,
context_tokens,
context_attention_mask,
context_types)
else:
raise ValueError("Cannot embed block without the block model.")
return query_logits, context_logits
@staticmethod
def embed_text(model, tokens, attention_mask, token_types):
"""Embed a batch of tokens using the model"""
logits = model(tokens,
attention_mask,
token_types)
return logits
def state_dict_for_save_checkpoint(self, destination=None, \
prefix='', keep_vars=False):
"""Save dict with state dicts of each of the models."""
state_dict_ = {}
if self.biencoder_shared_query_context_model:
state_dict_[self._model_key] = \
self.model.state_dict_for_save_checkpoint(destination,
prefix,
keep_vars)
else:
if self.use_query_model:
state_dict_[self._query_key] = \
self.query_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.use_context_model:
state_dict_[self._context_key] = \
self.context_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Load the state dicts of each of the models"""
if self.biencoder_shared_query_context_model:
print_rank_0("Loading shared query-context model")
self.model.load_state_dict(state_dict[self._model_key], \
strict=strict)
else:
if self.use_query_model:
print_rank_0("Loading query model")
self.query_model.load_state_dict( \
state_dict[self._query_key], strict=strict)
if self.use_context_model:
print_rank_0("Loading context model")
self.context_model.load_state_dict( \
state_dict[self._context_key], strict=strict)
def init_state_dict_from_bert(self):
"""Initialize the state from a pretrained BERT model
on iteration zero of ICT pretraining"""
args = get_args()
if args.bert_load is None:
print_rank_0("bert-load argument is None")
return
tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
if not os.path.isfile(tracker_filename):
raise FileNotFoundError("Could not find BERT checkpoint")
with open(tracker_filename, 'r') as f:
iteration = int(f.read().strip())
assert iteration > 0
checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
if mpu.get_data_parallel_rank() == 0:
print('global rank {} is loading BERT checkpoint {}'.format(
torch.distributed.get_rank(), checkpoint_name))
# Load the checkpoint.
try:
state_dict = torch.load(checkpoint_name, map_location='cpu')
except ModuleNotFoundError:
from megatron.fp16_deprecated import loss_scaler
# For backward compatibility.
print_rank_0(' > deserializing using the old code structure ...')
sys.modules['fp16.loss_scaler'] = sys.modules[
'megatron.fp16_deprecated.loss_scaler']
sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
'megatron.fp16_deprecated.loss_scaler']
state_dict = torch.load(checkpoint_name, map_location='cpu')
sys.modules.pop('fp16.loss_scaler', None)
sys.modules.pop('megatron.fp16.loss_scaler', None)
except BaseException:
print_rank_0('could not load the BERT checkpoint')
sys.exit()
checkpoint_version = state_dict.get('checkpoint_version', 0)
# load the LM state dict into each model
model_dict = state_dict['model']['language_model']
if self.biencoder_shared_query_context_model:
self.model.language_model.load_state_dict(model_dict)
fix_query_key_value_ordering(self.model, checkpoint_version)
else:
if self.use_query_model:
self.query_model.language_model.load_state_dict(model_dict)
# give each model the same ict_head to begin with as well
if self.biencoder_projection_dim > 0:
query_proj_state_dict = \
self.state_dict_for_save_checkpoint()\
[self._query_key]['projection_enc']
fix_query_key_value_ordering(self.query_model, checkpoint_version)
if self.use_context_model:
self.context_model.language_model.load_state_dict(model_dict)
if self.query_model is not None and \
self.biencoder_projection_dim > 0:
self.context_model.projection_enc.load_state_dict\
(query_proj_state_dict)
fix_query_key_value_ordering(self.context_model, checkpoint_version)
class PretrainedBertModel(MegatronModule):
"""BERT-based encoder for queries or contexts used for
learned information retrieval."""
def __init__(self, num_tokentypes=2,
parallel_output=True):
super(PretrainedBertModel, self).__init__()
args = get_args()
tokenizer = get_tokenizer()
self.pad_id = tokenizer.pad
self.biencoder_projection_dim = args.biencoder_projection_dim
self.parallel_output = parallel_output
init_method = init_method_normal(args.init_method_std)
scaled_init_method = scaled_init_method_normal(
args.init_method_std, args.num_layers)
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=False,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method)
if args.biencoder_projection_dim > 0:
self.projection_enc = get_linear_layer(args.hidden_size,
args.biencoder_projection_dim,
init_method)
self._projection_enc_key = 'projection_enc'
def forward(self, input_ids, attention_mask, tokentype_ids=None):
extended_attention_mask = attention_mask.unsqueeze(1)
#extended_attention_mask = bert_extended_attention_mask(attention_mask)
position_ids = bert_position_ids(input_ids)
lm_output = self.language_model(input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids)
# This mask will be used in average-pooling and max-pooling
pool_mask = (input_ids == self.pad_id).unsqueeze(2)
# Taking the representation of the [CLS] token of BERT
pooled_output = lm_output[:, 0, :]
# Convert to the same dtype as the LM output
pooled_output = pooled_output.to(lm_output.dtype)
# Output.
if self.biencoder_projection_dim:
pooled_output = self.projection_enc(pooled_output)
return pooled_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.biencoder_projection_dim > 0:
state_dict_[self._projection_enc_key] = \
self.projection_enc.state_dict(destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
print_rank_0("loading BERT weights")
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
if self.biencoder_projection_dim > 0:
print_rank_0("loading projection head weights")
self.projection_enc.load_state_dict(
state_dict[self._projection_enc_key], strict=strict)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification model."""
import torch
from megatron import get_args, print_rank_last
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from .module import MegatronModule
class Classification(MegatronModule):
def __init__(self,
num_classes,
num_tokentypes=2,
pre_process=True,
post_process=True):
super(Classification, self).__init__(share_word_embeddings=False)
args = get_args()
self.num_classes = num_classes
self.pre_process = pre_process
self.post_process = post_process
init_method = init_method_normal(args.init_method_std)
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=True,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers),
pre_process=self.pre_process,
post_process=self.post_process)
# Multi-choice head.
if self.post_process:
self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
self.classification_head = get_linear_layer(args.hidden_size,
self.num_classes,
init_method)
self._classification_head_key = 'classification_head'
def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
self.language_model.set_input_tensor(input_tensor)
def forward(self, model_input, attention_mask, tokentype_ids=None):
extended_attention_mask = bert_extended_attention_mask(attention_mask)
input_ids = model_input
position_ids = bert_position_ids(input_ids)
lm_output = self.language_model(
input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids
)
if self.post_process:
_, pooled_output = lm_output
classification_output = self.classification_dropout(pooled_output)
classification_logits = self.classification_head(classification_output)
# Reshape back to separate choices.
classification_logits = classification_logits.view(-1, self.num_classes)
return classification_logits
return lm_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.post_process:
state_dict_[self._classification_head_key] \
= self.classification_head.state_dict(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
if self.post_process:
if self._classification_head_key in state_dict:
self.classification_head.load_state_dict(
state_dict[self._classification_head_key], strict=strict)
else:
print_rank_last('***WARNING*** could not find {} in the checkpoint, '
'initializing to random'.format(
self._classification_head_key))
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC
from abc import abstractmethod
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from megatron import get_args
from megatron import mpu
from .module import MegatronModule
class MemoryBuffer:
def __init__(self, numel, dtype):
self.numel = numel
self.dtype = dtype
self.data = torch.zeros(self.numel,
dtype=self.dtype,
device=torch.cuda.current_device(),
requires_grad=False)
def zero(self):
"""Reset the buffer to zero."""
self.data.zero_()
def get(self, shape, start_index):
"""Return a tensor with the input `shape` as a view into the
1-D data starting at `start_index`."""
end_index = start_index + shape.numel()
assert end_index <= self.numel, \
'requested tensor is out of the buffer range.'
buffer_tensor = self.data[start_index:end_index]
buffer_tensor = buffer_tensor.view(shape)
return buffer_tensor
class DistributedDataParallelBase(MegatronModule, ABC):
"""Abstract class for DDP."""
def __init__(self, module):
super(DistributedDataParallelBase, self).__init__()
# Keep a pointer to the model.
self.module = module
@abstractmethod
def allreduce_gradients(self):
pass
def forward(self, *inputs, **kwargs):
return self.module(*inputs, **kwargs)
def state_dict(self, destination=None, prefix='', keep_vars=False):
return self.module.state_dict(destination, prefix, keep_vars)
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
return self.module.state_dict_for_save_checkpoint(destination, prefix,
keep_vars)
def load_state_dict(self, state_dict, strict=True):
self.module.load_state_dict(state_dict, strict=strict)
class DistributedDataParallel(DistributedDataParallelBase):
"""DDP with contiguous buffers options to storre and accumulate gradients.
This class:
- has the potential to reduce memory fragmentation.
- provides the option to do the gradient accumulation
in a type other than the params type (for example fp32)
Arguments:
module: input model.
accumulate_allreduce_grads_in_fp32: if true do the gradient accumulation
and the gradient all-reduce all in float32. If this option is
true, we require `use_contiguous_buffers` to be true too.
use_contiguous_buffers: if true, use a contiguous buffer to store the
gradients.
"""
def __init__(self, module,
accumulate_allreduce_grads_in_fp32,
use_contiguous_buffers):
super(DistributedDataParallel, self).__init__(module)
self.accumulate_allreduce_grads_in_fp32 \
= accumulate_allreduce_grads_in_fp32
self.use_contiguous_buffers = use_contiguous_buffers
# If we are using fp32-accumulate-allreduce explicitly
# this means we need main grads in a contiguous buffer.
if self.accumulate_allreduce_grads_in_fp32:
assert self.use_contiguous_buffers
# ===================================
# Rest of this part applies only to
# the case where we use contiguous buffers.
# ===================================
self._grad_buffers = None
if self.use_contiguous_buffers:
self._grad_buffers = {}
# Simple function to define buffer type.
def _get_buffer_type(param):
return torch.float if \
self.accumulate_allreduce_grads_in_fp32 else param.dtype
# First calculate total number of elements per type.
type_num_elements = {}
for param in self.module.parameters():
if param.requires_grad:
dtype = _get_buffer_type(param)
type_num_elements[dtype] = type_num_elements.get(dtype, 0) \
+ param.data.nelement()
# Allocate the buffer.
for dtype, num_elements in type_num_elements.items():
self._grad_buffers[dtype] = MemoryBuffer(num_elements, dtype)
# Assume the back-prop order is the reverse of the params order,
# store the start index for the gradients.
for param in self.module.parameters():
if param.requires_grad:
dtype = _get_buffer_type(param)
type_num_elements[dtype] -= param.data.nelement()
param.main_grad = self._grad_buffers[dtype].get(
param.data.shape, type_num_elements[dtype])
# Backward hook.
# Accumulation function for the gradients. We need
# to store them so they don't go out of scope.
self.grad_accs = []
# Loop over all the parameters in the model.
for param in self.module.parameters():
if param.requires_grad:
# Expand so we get access to grad_fn.
param_tmp = param.expand_as(param)
# Get the gradient accumulator function.
grad_acc = param_tmp.grad_fn.next_functions[0][0]
grad_acc.register_hook(self._make_param_hook(param))
self.grad_accs.append(grad_acc)
def _make_param_hook(self, param):
"""Create the all-reduce hook for backprop."""
# Hook used for back-prop.
def param_hook(*unused):
# Add the gradient to the buffer.
if param.grad.data is not None:
param.main_grad.add_(param.grad.data)
# Now we can deallocate grad memory.
param.grad = None
return param_hook
def zero_grad_buffer(self):
"""Set the grad buffer data to zero. Needs to be called at the
beginning of each iteration."""
assert self._grad_buffers is not None, 'buffers are not initialized.'
for _, buffer_ in self._grad_buffers.items():
buffer_.zero()
def allreduce_gradients(self):
"""Reduce gradients across data parallel ranks."""
# If we have buffers, simply reduce the data in the buffer.
if self._grad_buffers is not None:
for _, buffer_ in self._grad_buffers.items():
buffer_.data /= mpu.get_data_parallel_world_size()
torch.distributed.all_reduce(
buffer_.data, group=mpu.get_data_parallel_group())
else:
# Otherwise, bucketize and all-reduce
buckets = {}
# Pack the buckets.
for param in self.module.parameters():
if param.requires_grad and param.grad is not None:
tp = param.data.type()
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(param)
param.main_grad = param.grad
# For each bucket, all-reduce and copy all-reduced grads.
for tp in buckets:
bucket = buckets[tp]
grads = [param.grad.data for param in bucket]
coalesced = _flatten_dense_tensors(grads)
coalesced /= mpu.get_data_parallel_world_size()
torch.distributed.all_reduce(
coalesced, group=mpu.get_data_parallel_group())
for buf, synced in zip(grads, _unflatten_dense_tensors(
coalesced, grads)):
buf.copy_(synced)
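# --- Illustrative sketch (not part of the original file) ---
# The DDP wrapper above needs an initialized process group. This CPU-only sketch
# just shows the contiguous-buffer idea: each parameter's `main_grad` becomes a
# view into one flat tensor, laid out in reverse parameter order as in __init__.
if __name__ == "__main__":
    params = [torch.nn.Parameter(torch.randn(4, 3)),
              torch.nn.Parameter(torch.randn(5))]
    total = sum(p.numel() for p in params)
    flat = torch.zeros(total)
    offset = total
    for p in params:  # reverse-order layout
        offset -= p.numel()
        p.main_grad = flat[offset:offset + p.numel()].view(p.shape)
    # Accumulating into the views fills the flat buffer that would be all-reduced.
    for p in params:
        p.main_grad.add_(torch.ones_like(p))
    print(flat.sum().item())  # 17.0 == total number of elements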
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
###### BIAS GELU FUSION/ NO AUTOGRAD ################
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2) -> 0.70710678
# sqrt(2/pi) -> 0.79788456
# this function is the tanh approximation of gelu
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
@torch.jit.script
def bias_gelu(bias, y):
x = bias + y
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
# gradient of the tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@torch.jit.script
def bias_gelu_back(g, bias, y):
x = bias + y
tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
# sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
return ff*g
class GeLUFunction(torch.autograd.Function):
@staticmethod
# bias is an optional argument
def forward(ctx, input, bias):
ctx.save_for_backward(input, bias)
return bias_gelu(bias, input)
@staticmethod
def backward(ctx, grad_output):
input, bias = ctx.saved_tensors
tmp = bias_gelu_back(grad_output, bias, input)
return tmp, tmp
bias_gelu_impl = GeLUFunction.apply
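# --- Illustrative sketch (not part of the original file) ---
# Quick CPU check that the tanh approximation above stays close to the exact
# (erf-based) GELU; the tolerance noted in the comment is illustrative.
if __name__ == "__main__":
    y = torch.randn(1024, dtype=torch.float32)
    bias = torch.zeros_like(y)
    approx = bias_gelu(bias, y)
    exact = y * 0.5 * (1.0 + torch.erf(y * 0.70710678))
    print(torch.max(torch.abs(approx - exact)).item())  # typically below 1e-3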
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This code is copied fron NVIDIA apex:
https://github.com/NVIDIA/apex
with some changes. """
import numbers
from megatron import get_args
from megatron import mpu
from packaging import version
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import importlib
import torch
import torch.nn.functional as F
global fused_mix_prec_layer_norm_cuda
fused_mix_prec_layer_norm_cuda = None
class FusedLayerNormAffineFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, input, weight, bias, normalized_shape, eps):
ctx.normalized_shape = normalized_shape
ctx.eps = eps
input_ = input.contiguous()
weight_ = weight.contiguous()
bias_ = bias.contiguous()
output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
return output
@staticmethod
def backward(ctx, grad_output):
input_, weight_, bias_, mean, invvar = ctx.saved_tensors
grad_input = grad_weight = grad_bias = None
grad_input, grad_weight, grad_bias \
= fused_mix_prec_layer_norm_cuda.backward_affine(
grad_output.contiguous(), mean, invvar,
input_, ctx.normalized_shape,
weight_, bias_, ctx.eps)
return grad_input, grad_weight, grad_bias, None, None
class MixedFusedLayerNorm(torch.nn.Module):
def __init__(self, normalized_shape, eps=1e-5):
super(MixedFusedLayerNorm, self).__init__()
global fused_mix_prec_layer_norm_cuda
fused_mix_prec_layer_norm_cuda = importlib.import_module(
"fused_mix_prec_layer_norm_cuda")
if isinstance(normalized_shape, numbers.Integral):
normalized_shape = (normalized_shape,)
self.normalized_shape = torch.Size(normalized_shape)
self.eps = eps
self.weight = Parameter(torch.Tensor(*normalized_shape))
self.bias = Parameter(torch.Tensor(*normalized_shape))
self.reset_parameters()
args = get_args()
self.layernorm_tp_auto_sync = args.sync_tp_duplicated_parameters
self.use_meg_ds_fused_layer_norm = (
args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920
)
def reset_parameters(self):
init.ones_(self.weight)
init.zeros_(self.bias)
def forward(self, input):
if self.layernorm_tp_auto_sync:
torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
if self.use_meg_ds_fused_layer_norm:
return FusedLayerNormAffineFunction.apply(
input, self.weight, self.bias, self.normalized_shape, self.eps)
else:
return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
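# --- Illustrative sketch (not part of the original file) ---
# MixedFusedLayerNorm above needs megatron's global args and the fused CUDA
# extension. This standalone sketch only exercises the plain F.layer_norm
# fallback path with the same ones/zeros affine initialization.
if __name__ == "__main__":
    hidden = 8
    weight = torch.ones(hidden)
    bias = torch.zeros(hidden)
    x = torch.randn(2, 4, hidden)
    out = F.layer_norm(x, (hidden,), weight, bias, eps=1e-5)
    print(out.shape, out.mean(dim=-1).abs().max().item())  # per-position mean ~ 0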
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import lru_cache
import torch
import torch.nn as nn
from megatron.enums import AttnMaskType
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
"""
Fused operation which performs the following three operations in sequence
1. Scale the tensor.
2. Apply upper triangular mask (typically used in gpt models).
3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
import scaled_upper_triang_masked_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(
inputs, scale_t[0]
)
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_upper_triang_masked_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_upper_triang_masked_softmax_cuda.backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None
class ScaledMaskedSoftmax(torch.autograd.Function):
"""
    Fused operation which performs the following three operations in sequence:
1. Scale the tensor.
2. Apply the mask.
3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, mask, scale):
import scaled_masked_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_masked_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_masked_softmax_cuda.backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None, None
class ScaledSoftmax(torch.autograd.Function):
"""
    Fused operation which performs the following two operations in sequence:
1. Scale the tensor.
2. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
import scaled_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_softmax_cuda.forward(
inputs, scale_t[0]
)
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_softmax_cuda.backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None, None
class FusedScaleMaskSoftmax(nn.Module):
"""
fused operation: scaling + mask + softmax
Arguments:
        input_in_fp16: flag to indicate if input is in fp16 data format.
        input_in_bf16: flag to indicate if input is in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate whether the user wants to use softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed at fp32 precision.
scale: scaling factor used in input tensor scaling.
"""
def __init__(
self,
input_in_fp16,
input_in_bf16,
attn_mask_type,
scaled_masked_softmax_fusion,
mask_func,
softmax_in_fp32,
scale,
):
super(FusedScaleMaskSoftmax, self).__init__()
self.input_in_fp16 = input_in_fp16
self.input_in_bf16 = input_in_bf16
assert not (
self.input_in_fp16 and self.input_in_bf16
), "both fp16 and bf16 flags cannot be active at the same time."
self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
self.attn_mask_type = attn_mask_type
self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
self.mask_func = mask_func
self.softmax_in_fp32 = softmax_in_fp32
self.scale = scale
assert (
self.scale is None or softmax_in_fp32
), "softmax should be in fp32 when scaled"
def forward(self, input, mask):
# [b, np, sq, sk]
assert input.dim() == 4
if self.is_kernel_available(mask, *input.size()):
return self.forward_fused_softmax(input, mask)
else:
return self.forward_torch_softmax(input, mask)
def is_kernel_available(self, mask, b, np, sq, sk):
attn_batches = b * np
if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16 or bf16
            and 16 < sk <= 4096  # sk must be 16 ~ 4096
            and sq % 4 == 0  # sq must be a multiple of 4
            and attn_batches % 4 == 0  # np * b must be a multiple of 4
):
if 0 <= sk <= 4096:
batch_per_block = self.get_batch_per_block(sq, sk, b, np)
if self.attn_mask_type == AttnMaskType.causal:
if attn_batches % batch_per_block == 0:
return True
else:
if sq % batch_per_block == 0:
return True
return False
def forward_fused_softmax(self, input, mask):
b, np, sq, sk = input.size()
scale = self.scale if self.scale is not None else 1.0
if self.attn_mask_type == AttnMaskType.causal:
assert sq == sk, "causal mask is only for self attention"
# assert mask is None, "Mask is silently ignored due to the use of a custom kernel"
# input is 3D tensor (attn_batches, sq, sk)
input = input.view(-1, sq, sk)
probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
return probs.view(b, np, sq, sk)
else:
# input is 4D tensor (b, np, sq, sk)
if mask is not None:
return ScaledMaskedSoftmax.apply(input, mask, scale)
else:
return ScaledSoftmax.apply(input, scale)
@staticmethod
@lru_cache(maxsize=1)
def get_causal_mask(sequence_length: int):
mask = torch.ones(1, 1, sequence_length, sequence_length, dtype=torch.bool, device=torch.cuda.current_device())
return torch.triu(mask, diagonal=1)
def forward_torch_softmax(self, input, mask):
if self.input_in_float16 and self.softmax_in_fp32:
input = input.float()
if self.scale is not None:
input = input * self.scale
if self.attn_mask_type == AttnMaskType.causal:
# assert mask is None
assert input.shape[2] == input.shape[3]
mask = self.get_causal_mask(input.shape[2])
mask_output = self.mask_func(input, mask) if mask is not None else input
probs = torch.nn.Softmax(dim=-1)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16:
probs = probs.half()
else:
probs = probs.bfloat16()
return probs
@staticmethod
def get_batch_per_block(sq, sk, b, np):
import scaled_masked_softmax_cuda
return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
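
# A minimal illustrative sketch of what the fused kernels above compute, written with plain
# torch ops: scale, apply a causal mask, then softmax. The -10000.0 fill value is a stand-in
# assumption for Megatron's attention mask function; shapes are illustrative only.
def _scale_mask_softmax_reference_sketch():
    import torch

    b, heads, sq, sk = 2, 3, 4, 4
    scores = torch.randn(b, heads, sq, sk)
    scale = 0.125

    # Upper-triangular (causal) mask: True marks positions to be masked out.
    causal_mask = torch.triu(torch.ones(1, 1, sq, sk, dtype=torch.bool), diagonal=1)

    masked = (scores * scale).masked_fill(causal_mask, -10000.0)
    probs = torch.nn.Softmax(dim=-1)(masked)
    return probs
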
import torch
from torch import nn
from torch.nn import functional as F
from megatron import logging
from megatron.model.utils import log_debug_usage
logger = logging.get_logger(__name__)
class _GLUBaseModule(nn.Module):
def __init__(self, activation_fn):
super().__init__()
self.activation_fn = activation_fn
def forward(self, x):
# dim=-1 breaks in jit for pt<1.10
x1, x2 = x.chunk(2, dim=(x.ndim - 1))
return x1 * self.activation_fn(x2)
class LiGLU(_GLUBaseModule):
def __init__(self):
super().__init__(nn.Identity())
class GEGLU(_GLUBaseModule):
def __init__(self):
super().__init__(F.gelu)
class ReGLU(_GLUBaseModule):
def __init__(self):
super().__init__(F.relu)
class SwiGLU(_GLUBaseModule):
def __init__(self):
super().__init__(F.silu)
liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU()))
geglu = log_debug_usage(logger, "Using GLU activation: GEGLU.")(torch.jit.script(GEGLU()))
reglu = log_debug_usage(logger, "Using GLU activation: ReGLU.")(torch.jit.script(ReGLU()))
swiglu = log_debug_usage(logger, "Using GLU activation: SwiGLU.")(torch.jit.script(SwiGLU()))
GLU_ACTIVATIONS = {
"geglu": geglu,
"liglu": liglu,
"reglu": reglu,
"swiglu": swiglu,
}
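
# A minimal usage sketch for the GLU variants above: the last dimension is split in half and
# one half gates the other through the activation. Shapes below are illustrative only.
def _glu_usage_sketch():
    import torch

    x = torch.randn(2, 5, 16)       # the last dimension must be even
    out = GEGLU()(x)                # x1 * gelu(x2); `geglu` above adds jit + debug logging
    assert out.shape == (2, 5, 8)   # output has half the last dimension
    return out
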
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT-2 model."""
from functools import partial
import torch
from megatron import get_args
from megatron import mpu
from megatron.enums import AttnMaskType
from .module import MegatronModule, fp32_to_float16
from .language_model import parallel_lm_logits
from .language_model import get_language_model
from .utils import init_method_normal
from .utils import scaled_init_method_normal
from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec
from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from megatron.model.module import float16_to_fp32
from .language_model import EmbeddingPipe
from .transformer import ParallelTransformerLayerPipe
def post_language_model_processing(lm_output, labels, logit_weights,
get_key_value, parallel_output,
forward_method_parallel_output,
fp16_lm_cross_entropy):
if get_key_value:
lm_output, presents = lm_output
# Output.
if forward_method_parallel_output is not None:
parallel_output = forward_method_parallel_output
output = parallel_lm_logits(
lm_output,
logit_weights,
parallel_output)
if get_key_value:
output = [output, presents]
if labels is None:
return output
else:
if fp16_lm_cross_entropy:
assert output.dtype == torch.half
loss = mpu.vocab_parallel_cross_entropy(output, labels)
else:
loss = mpu.vocab_parallel_cross_entropy(output.float(), labels)
return loss
class GPTModel(MegatronModule):
"""GPT-2 Language model."""
def __init__(
self,
num_tokentypes=0,
parallel_output=True,
pre_process=True,
post_process=True,
prefix_lm=False,
):
super(GPTModel, self).__init__()
args = get_args()
self.parallel_output = parallel_output
self.pre_process = pre_process
self.post_process = post_process
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=False,
            # TODO: Change naming of class from GPT to something that encapsulates prefix lm.
encoder_attn_mask_type=AttnMaskType.prefix if prefix_lm else AttnMaskType.causal,
init_method=init_method_normal(args.init_method_std),
scaled_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers),
pre_process=self.pre_process,
post_process=self.post_process)
self.initialize_word_embeddings(init_method_normal)
def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
self.language_model.set_input_tensor(input_tensor)
def forward(self, input_ids, position_ids, attention_mask, labels=None,
tokentype_ids=None, layer_past=None, get_key_value=False,
forward_method_parallel_output=None, curriculum_seqlen=None):
if curriculum_seqlen is not None:
args = get_args()
args.curriculum_seqlen = curriculum_seqlen
if curriculum_seqlen < input_ids.size()[1]:
# seqlen-based curriculum learning
# input_ids, position_ids, labels have size [batch size, seqlen]
input_ids = input_ids[:, :curriculum_seqlen].contiguous()
position_ids = position_ids[:, :curriculum_seqlen].contiguous()
labels = labels[:, :curriculum_seqlen].contiguous()
# attention_mask has size [1, 1, seqlen, seqlen]
attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous()
lm_output = self.language_model(
input_ids,
position_ids,
attention_mask,
layer_past=layer_past,
get_key_value=get_key_value)
if self.post_process:
return post_language_model_processing(
lm_output, labels,
self.word_embeddings_weight(),
get_key_value,
self.parallel_output,
forward_method_parallel_output,
self.fp16_lm_cross_entropy)
else:
return lm_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
# Save word_embeddings.
if self.post_process and not self.pre_process:
state_dict_[self._word_embeddings_for_head_key] \
= self.word_embeddings.state_dict(destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
# Load word_embeddings.
if self.post_process and not self.pre_process:
self.word_embeddings.load_state_dict(
state_dict[self._word_embeddings_for_head_key], strict=strict)
if self._language_model_key in state_dict:
state_dict = state_dict[self._language_model_key]
self.language_model.load_state_dict(state_dict, strict=strict)
def get_cross_entropy(is_prefix: bool):
def CrossEntropy(output, labels):
labels, loss_mask = labels[0], labels[1]
args = get_args()
losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(), labels)
if is_prefix:
micro_batch_size, sequence_length = loss_mask.shape
average_tokens_per_sample: torch.Tensor
if args.loss_on_targets_only:
                # HACK: This is useful when the loss masks are microbatch dependent. If we want to
                # preserve the notion that all tokens have the same impact on the loss, we can only
                # normalise with a microbatch-independent value: the expected number of target
                # tokens over a microbatch. Here we still use `sequence_length`, which is batch
                # dependent, in order to stay backwards compatible with the current vanilla GPT
                # experiments.
if args.reweight_loss_based_on_position_frequency:
reweight = torch.arange(
sequence_length, 0, -1, dtype=torch.float, device=loss_mask.device
) / (sequence_length + 1) * 2
average_tokens_per_sample = reweight.flip(-1).cumsum(-1).mean()
else:
average_tokens_per_sample = (sequence_length + 1) / 2
else:
average_tokens_per_sample = sequence_length
expected_number_of_tokens = average_tokens_per_sample * micro_batch_size
else:
expected_number_of_tokens = loss_mask.sum()
loss_mask = loss_mask.view(-1)
loss = torch.sum(losses.view(-1) * loss_mask) / expected_number_of_tokens
return loss
return CrossEntropy
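
# A minimal illustrative sketch of the normalisation performed by CrossEntropy above, without
# the model-parallel pieces: per-token losses are masked and divided by the expected number of
# tokens (for the non-prefix case this is simply loss_mask.sum()). Values are illustrative only.
def _masked_loss_normalisation_sketch():
    import torch

    micro_batch_size, sequence_length = 2, 6
    token_losses = torch.rand(micro_batch_size, sequence_length)
    loss_mask = torch.ones(micro_batch_size, sequence_length)
    loss_mask[:, :2] = 0.0          # e.g. ignore the first two positions of every sample

    expected_number_of_tokens = loss_mask.sum()
    loss = torch.sum(token_losses.view(-1) * loss_mask.view(-1)) / expected_number_of_tokens
    return loss
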
class GPTModelPipe(PipelineModule, MegatronModule):
"""GPT-2 Language model."""
def __init__(
self,
num_tokentypes=0,
parallel_output=True,
attn_mask_type: AttnMaskType = AttnMaskType.causal
):
args = get_args()
self.parallel_output = parallel_output
init_method = init_method_normal(args.init_method_std)
self.specs = []
def _to_float16(inputs):
if args.fp16:
return fp32_to_float16(inputs, lambda v: v.half())
elif args.bf16:
return fp32_to_float16(inputs, lambda v: v.bfloat16())
else:
return inputs
self.specs.append(_to_float16)
# Embedding layer
self.specs.append(TiedLayerSpec('embed',
EmbeddingPipe,
args.hidden_size,
args.padded_vocab_size,
args.hidden_dropout,
init_method=init_method,
num_tokentypes=num_tokentypes,
tied_weight_attr='word_embeddings_weight'))
if args.fp32_residual_connection:
if getattr(args, 'pretrain_causal_attention', False):
self.specs.append(lambda x: x.transpose(0, 1).contiguous().float())
else:
# EmbeddingPipe returns attention mask as well
self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous().float(), *x[1:]))
else:
if getattr(args, 'pretrain_causal_attention', False):
self.specs.append(lambda x: x.transpose(0, 1).contiguous())
else:
# EmbeddingPipe returns attention mask as well
self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), *x[1:]))
for layer_idx in range(args.num_layers):
self.specs.append(
LayerSpec(ParallelTransformerLayerPipe,
init_method=init_method,
output_layer_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers),
layer_number=layer_idx,
                          # TODO: Change naming of class from GPT to something that encapsulates prefix lm.
self_attn_mask_type=attn_mask_type))
# Undo data format change
def undo(x):
if not getattr(args, 'pretrain_causal_attention', False):
x = x[0]
return x.transpose(0, 1).contiguous()
self.specs.append(undo)
# Final layernorm after transformer layers
self.specs.append(
LayerSpec(LayerNorm,
args.hidden_size,
eps=args.layernorm_epsilon))
def _logits_helper(embedding, lm_output):
"""A wrapper to massage inputs/outputs from pipeline. """
return parallel_lm_logits(
lm_output,
embedding.word_embeddings_weight,
self.parallel_output)
self.specs.append(
TiedLayerSpec('embed',
EmbeddingPipe,
args.hidden_size,
args.padded_vocab_size,
args.hidden_dropout,
init_method=init_method,
num_tokentypes=num_tokentypes,
forward_fn=_logits_helper,
tied_weight_attr='word_embeddings_weight')
)
# Convert to fp32 if needed
if args.fp16 or args.bf16:
self.specs.append(float16_to_fp32)
if args.checkpoint_activations:
interval = args.checkpoint_num_layers
else:
interval = 0
from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology
topo = PipeModelDataParallelTopology(num_pp=mpu.get_pipeline_model_parallel_world_size(),
num_mp=mpu.get_tensor_model_parallel_world_size(),
num_dp=mpu.get_data_parallel_world_size())
# here one can extend the regex to include more layers to be counted towards partitioning,
# e.g. 'type:transformer|embedding' will add up all the transformer blocks and also the first
# and last embedding layers and then partition that transformers+2 layers - so to get a good
        # balance you may want to use fewer transformer layers
#
# caveat emptor: the current implementation of PP fails unless each stage has at least one
# transformer layer
if args.pp_partition_method is not None:
partition_method = args.pp_partition_method
else:
partition_method = 'type:transformer'
super().__init__(layers=self.specs,
loss_fn=get_cross_entropy(is_prefix=attn_mask_type is AttnMaskType.prefix),
topology=topo,
activation_checkpoint_interval=interval,
partition_method=partition_method)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer based language model."""
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron import mpu
from .module import MegatronModule
from megatron.enums import LayerType, AttnMaskType, PositionEmbeddingType
from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal, scaled_init_method_normal
def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
bias=None):
"""LM logits using word embedding weights."""
# Parallel logits.
input_parallel = mpu.copy_to_tensor_model_parallel_region(input_)
# Matrix multiply.
if bias is None:
logits_parallel = F.linear(input_parallel, word_embeddings_weight)
else:
logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias)
# Gather if needed.
if parallel_output:
return logits_parallel
return mpu.gather_from_tensor_model_parallel_region(logits_parallel)
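
# A minimal illustrative sketch of parallel_lm_logits above for the single tensor-parallel-rank
# case, where copy/gather are identities: logits reduce to a linear projection onto the (tied)
# word embedding weight. Shapes are illustrative only.
def _lm_logits_single_rank_sketch():
    import torch
    import torch.nn.functional as F

    hidden_size, vocab_size = 8, 32
    hidden_states = torch.randn(4, 2, hidden_size)                # [s, b, h]
    word_embeddings_weight = torch.randn(vocab_size, hidden_size)

    logits = F.linear(hidden_states, word_embeddings_weight)      # [s, b, vocab]
    assert logits.shape == (4, 2, vocab_size)
    return logits
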
def get_language_model(num_tokentypes, add_pooler,
encoder_attn_mask_type, init_method=None,
scaled_init_method=None, add_decoder=False,
decoder_attn_mask_type=AttnMaskType.causal,
pre_process=True, post_process=True):
"""Build language model and return along with the key to save."""
args = get_args()
if init_method is None:
init_method = init_method_normal(args.init_method_std)
if scaled_init_method is None:
scaled_init_method = scaled_init_method_normal(args.init_method_std,
args.num_layers)
# Language model.
language_model = TransformerLanguageModel(
init_method,
scaled_init_method,
encoder_attn_mask_type,
num_tokentypes=num_tokentypes,
add_decoder=add_decoder,
decoder_attn_mask_type=decoder_attn_mask_type,
add_pooler=add_pooler,
pre_process=pre_process,
post_process=post_process
)
# key used for checkpoints.
language_model_key = 'language_model'
return language_model, language_model_key
class Pooler(MegatronModule):
"""Pooler layer.
Pool hidden states of a specific token (for example start of the
sequence) and add a linear transformation followed by a tanh.
Arguments:
hidden_size: hidden size
init_method: weight initialization method for the linear layer.
bias is set to zero.
"""
def __init__(self, hidden_size, init_method):
super(Pooler, self).__init__()
self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
def forward(self, hidden_states, sequence_index=0):
# hidden_states: [b, s, h]
# sequence_index: index of the token to pool.
pooled = hidden_states[:, sequence_index, :]
pooled = self.dense(pooled)
pooled = torch.tanh(pooled)
return pooled
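
# A minimal illustrative sketch of the pooling performed by Pooler above, with a plain
# nn.Linear standing in for get_linear_layer (an assumption made for self-containment).
def _pooler_sketch():
    import torch

    hidden_size = 8
    hidden_states = torch.randn(2, 5, hidden_size)    # [b, s, h]
    dense = torch.nn.Linear(hidden_size, hidden_size)

    pooled = hidden_states[:, 0, :]                   # pool the first token of the sequence
    pooled = torch.tanh(dense(pooled))
    assert pooled.shape == (2, hidden_size)
    return pooled
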
class Embedding(MegatronModule):
"""Language model embeddings.
Arguments:
hidden_size: hidden size
vocab_size: vocabulary size
embedding_dropout_prob: dropout probability for embeddings
init_method: weight initialization method
num_tokentypes: size of the token-type embeddings. 0 value
will ignore this embedding
"""
def __init__(self,
hidden_size,
vocab_size,
embedding_dropout_prob,
init_method,
num_tokentypes=0):
super(Embedding, self).__init__()
self.hidden_size = hidden_size
self.init_method = init_method
self.num_tokentypes = num_tokentypes
args = get_args()
# Word embeddings (parallel).
self.word_embeddings = mpu.VocabParallelEmbedding(
vocab_size, self.hidden_size,
init_method=self.init_method)
self._word_embeddings_key = 'word_embeddings'
# Position embedding (serial).
self.position_embedding_type = args.position_embedding_type
if self.position_embedding_type == PositionEmbeddingType.absolute:
max_position_embeddings = args.max_position_embeddings
assert max_position_embeddings is not None
self.position_embeddings = torch.nn.Embedding(
max_position_embeddings, self.hidden_size)
self._position_embeddings_key = 'position_embeddings'
# Initialize the position embeddings.
self.init_method(self.position_embeddings.weight)
else:
self.position_embeddings = None
# Token type embedding.
# Add this as an optional field that can be added through
# method call so we can load a pretrain model without
# token types and add them as needed.
self._tokentype_embeddings_key = 'tokentype_embeddings'
if self.num_tokentypes > 0:
self.tokentype_embeddings = torch.nn.Embedding(
self.num_tokentypes, self.hidden_size)
# Initialize the token-type embeddings.
self.init_method(self.tokentype_embeddings.weight)
else:
self.tokentype_embeddings = None
# Embeddings dropout
self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
def add_tokentype_embeddings(self, num_tokentypes):
"""Add token-type embedding. This function is provided so we can add
token-type embeddings in case the pretrained model does not have it.
This allows us to load the model normally and then add this embedding.
"""
if self.tokentype_embeddings is not None:
raise Exception('tokentype embeddings is already initialized')
if torch.distributed.get_rank() == 0:
print('adding embedding for {} tokentypes'.format(num_tokentypes),
flush=True)
self.num_tokentypes = num_tokentypes
        self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, self.hidden_size)
# Initialize the token-type embeddings.
args = get_args()
self.init_method(self.tokentype_embeddings.weight)
def forward(self, input_ids, position_ids, tokentype_ids=None):
# Embeddings.
words_embeddings = self.word_embeddings(input_ids)
embeddings = words_embeddings
if self.position_embedding_type == PositionEmbeddingType.absolute:
assert self.position_embeddings is not None
embeddings = embeddings + self.position_embeddings(position_ids)
else:
assert self.position_embeddings is None
if tokentype_ids is not None:
assert self.tokentype_embeddings is not None
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
else:
assert self.tokentype_embeddings is None
# Dropout.
embeddings = self.embedding_dropout(embeddings)
return embeddings
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load."""
state_dict_ = {}
state_dict_[self._word_embeddings_key] \
= self.word_embeddings.state_dict(destination, prefix, keep_vars)
if self.position_embedding_type == PositionEmbeddingType.absolute:
state_dict_[self._position_embeddings_key] \
= self.position_embeddings.state_dict(
destination, prefix, keep_vars)
if self.num_tokentypes > 0:
state_dict_[self._tokentype_embeddings_key] \
= self.tokentype_embeddings.state_dict(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
# Word embedding.
if self._word_embeddings_key in state_dict:
state_dict_ = state_dict[self._word_embeddings_key]
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if 'word_embeddings' in key:
state_dict_[key.split('word_embeddings.')[1]] \
= state_dict[key]
self.word_embeddings.load_state_dict(state_dict_, strict=strict)
# Position embedding.
if self.position_embedding_type == PositionEmbeddingType.absolute:
if self._position_embeddings_key in state_dict:
state_dict_ = state_dict[self._position_embeddings_key]
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if 'position_embeddings' in key:
state_dict_[key.split('position_embeddings.')[1]] \
= state_dict[key]
self.position_embeddings.load_state_dict(state_dict_, strict=strict)
# Tokentype embedding.
if self.num_tokentypes > 0:
state_dict_ = {}
if self._tokentype_embeddings_key in state_dict:
state_dict_ = state_dict[self._tokentype_embeddings_key]
else:
# for backward compatibility.
for key in state_dict.keys():
if 'tokentype_embeddings' in key:
state_dict_[key.split('tokentype_embeddings.')[1]] \
= state_dict[key]
if len(state_dict_.keys()) > 0:
self.tokentype_embeddings.load_state_dict(state_dict_,
strict=strict)
else:
print('***WARNING*** expected tokentype embeddings in the '
'checkpoint but could not find it', flush=True)
class EmbeddingPipe(Embedding):
def forward(self, inputs, **kwargs):
if not hasattr(self, '_args'):
self._args = get_args()
input_ids = inputs[0]
position_ids = inputs[1]
if getattr(self._args, 'pretrain_causal_attention', False):
attention_mask = None
else:
attention_mask = inputs[2]
if len(inputs) == 4:
tokentype_ids = inputs[3]
else:
tokentype_ids = None
embeddings = super().forward(input_ids, position_ids, tokentype_ids=tokentype_ids)
# If cmd args has attn_mask, we don't forward it as an activation.
if getattr(self._args, 'pretrain_causal_attention', False):
return embeddings
else:
return embeddings, attention_mask
@property
def word_embeddings_weight(self):
"""Easy accessory for the DeepSpeed pipeline engine to tie embeddings across stages."""
return self.word_embeddings.weight
class TransformerLanguageModel(MegatronModule):
"""Transformer language model.
Arguments:
transformer_hparams: transformer hyperparameters
vocab_size: vocabulary size
embedding_dropout_prob: dropout probability for embeddings
num_tokentypes: size of the token-type embeddings. 0 value
will ignore this embedding
"""
def __init__(self,
init_method,
output_layer_init_method,
encoder_attn_mask_type,
num_tokentypes=0,
add_decoder=False,
decoder_attn_mask_type=AttnMaskType.causal,
add_pooler=False,
pre_process=True,
post_process=True):
super(TransformerLanguageModel, self).__init__()
args = get_args()
self.pre_process = pre_process
self.post_process = post_process
self.hidden_size = args.hidden_size
self.num_tokentypes = num_tokentypes
self.init_method = init_method
self.encoder_attn_mask_type = encoder_attn_mask_type
self.add_decoder = add_decoder
self.decoder_attn_mask_type = decoder_attn_mask_type
self.add_pooler = add_pooler
# Embeddings.
if self.pre_process:
self.embedding = Embedding(self.hidden_size,
args.padded_vocab_size,
args.hidden_dropout,
self.init_method,
self.num_tokentypes)
self._embedding_key = 'embedding'
# Transformer.
self.encoder = ParallelTransformer(
self.init_method,
output_layer_init_method,
self_attn_mask_type=self.encoder_attn_mask_type,
pre_process=self.pre_process,
post_process=self.post_process
)
self._encoder_key = 'encoder'
# Decoder
if self.add_decoder:
assert args.pipeline_model_parallel_size == 1, \
'pipeline parallelism is not supported in the presence of decoder'
self.decoder = ParallelTransformer(
self.init_method,
output_layer_init_method,
layer_type=LayerType.decoder,
self_attn_mask_type=self.decoder_attn_mask_type)
self._decoder_key = 'decoder'
if self.post_process:
# Pooler.
if self.add_pooler:
self.pooler = Pooler(self.hidden_size, self.init_method)
self._pooler_key = 'pooler'
def set_input_tensor(self, input_tensor):
""" See megatron.model.transformer.set_input_tensor()"""
self.encoder.set_input_tensor(input_tensor)
def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
enc_dec_attn_mask=None, tokentype_ids=None, layer_past=None,
get_key_value=False, pooling_sequence_index=0,
enc_hidden_states=None, output_enc_hidden=False):
# Embeddings.
if self.pre_process:
embedding_output = self.embedding(enc_input_ids, enc_position_ids,
tokentype_ids=tokentype_ids)
encoder_input = embedding_output
else:
encoder_input = None
# encoder.
if enc_hidden_states is None:
encoder_output = self.encoder(encoder_input,
enc_attn_mask,
layer_past=layer_past,
get_key_value=get_key_value)
else:
encoder_output = enc_hidden_states.to(encoder_input.dtype)
if self.post_process:
if self.add_pooler:
pooled_output = self.pooler(encoder_output,
pooling_sequence_index)
# output_enc_hidden refers to when we just need the encoder's
# output. For example, it is helpful to compute
# similarity between two sequences by average pooling
if not self.add_decoder or output_enc_hidden:
if self.add_pooler and self.post_process:
return encoder_output, pooled_output
else:
return encoder_output
# Decoder Embedding
dec_embedding_output = self.embedding(dec_input_ids,
dec_position_ids)
# decoder
decoder_output = self.decoder(dec_embedding_output,
dec_attn_mask,
layer_past=layer_past,
get_key_value=get_key_value,
encoder_output=encoder_output,
enc_dec_attn_mask=enc_dec_attn_mask)
if self.add_pooler and self.post_process:
return decoder_output, encoder_output, pooled_output
else:
return decoder_output, encoder_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load."""
state_dict_ = {}
if self.pre_process:
state_dict_[self._embedding_key] \
= self.embedding.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
state_dict_[self._encoder_key] \
= self.encoder.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.post_process:
if self.add_pooler:
state_dict_[self._pooler_key] \
= self.pooler.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.add_decoder:
state_dict_[self._decoder_key] \
= self.decoder.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
# Embedding.
if self.pre_process:
if self._embedding_key in state_dict:
state_dict_ = state_dict[self._embedding_key]
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if '_embeddings' in key:
state_dict_[key] = state_dict[key]
self.embedding.load_state_dict(state_dict_, strict=strict)
# Encoder.
if self._encoder_key in state_dict:
state_dict_ = state_dict[self._encoder_key]
# for backward compatibility.
elif 'transformer' in state_dict:
state_dict_ = state_dict['transformer']
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if 'transformer.' in key:
state_dict_[key.split('transformer.')[1]] = state_dict[key]
# for backward compatibility.
state_dict_self_attention = {}
for key in state_dict_.keys():
if '.attention.' in key:
state_dict_self_attention[key.replace(".attention.",
".self_attention.")] = state_dict_[key]
else:
state_dict_self_attention[key] = state_dict_[key]
state_dict_ = state_dict_self_attention
self.encoder.load_state_dict(state_dict_, strict=strict)
if self.post_process:
# pooler
if self.add_pooler:
assert 'pooler' in state_dict, \
'could not find data for pooler in the checkpoint'
self.pooler.load_state_dict(state_dict[self._pooler_key],
strict=strict)
# decoder
if self.add_decoder:
assert 'decoder' in state_dict, \
                'could not find data for decoder in the checkpoint'
self.decoder.load_state_dict(state_dict[self._decoder_key],
strict=strict)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron Module"""
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from megatron import get_args
from megatron import mpu
_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)
def param_is_not_shared(param):
return not hasattr(param, 'shared') or not param.shared
class MegatronModule(torch.nn.Module):
"""Megatron specific extensions of torch Module with support
for pipelining."""
def __init__(self, share_word_embeddings=True):
super(MegatronModule, self).__init__()
self.share_word_embeddings = share_word_embeddings
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""Use this function to override the state dict for
saving checkpoints."""
return self.state_dict(destination, prefix, keep_vars)
def word_embeddings_weight(self):
if mpu.is_pipeline_first_stage(ignore_virtual=True):
return self.language_model.embedding.word_embeddings.weight
if mpu.is_pipeline_last_stage(ignore_virtual=True):
if not self.share_word_embeddings:
raise Exception('word_embeddings_weight() called for last '
'stage, but share_word_embeddings is false')
return self.word_embeddings.weight
raise Exception('word_embeddings_weight() should be '
'called for first and last stage only')
def initialize_word_embeddings(self, init_method_normal):
args = get_args()
if not self.share_word_embeddings:
raise Exception('initialize_word_embeddings() was called but '
'share_word_embeddings is false')
# This function just initializes the word embeddings in the final stage
# when we are using pipeline parallelism. If we aren't using pipeline
# parallelism there is nothing to do.
if args.pipeline_model_parallel_size == 1:
return
# Parameters are shared between the word embeddings layer, and the
# heads at the end of the model. In a pipelined setup with more than
# one stage, the initial embedding layer and the head are on different
# workers, so we do the following:
# 1. Create a second copy of word_embeddings on the last stage, with
# initial parameters of 0.0.
# 2. Do an all-reduce between the first and last stage to ensure that
# the two copies of word_embeddings start off with the same
# parameter values.
        # 3. In the training loop, we then do an all-reduce between the grads of
        #    the two word_embeddings layers to ensure that every applied weight
        #    update is the same on both stages.
if mpu.is_pipeline_last_stage():
assert not mpu.is_pipeline_first_stage()
self._word_embeddings_for_head_key = 'word_embeddings_for_head'
# set word_embeddings weights to 0 here, then copy first
# stage's weights using all_reduce below.
self.word_embeddings = mpu.VocabParallelEmbedding(
args.padded_vocab_size, args.hidden_size,
init_method=init_method_normal(args.init_method_std))
self.word_embeddings.weight.data.fill_(0)
self.word_embeddings.weight.shared = True
# Ensure that first and last stages have the same initial parameter
# values.
if torch.distributed.is_initialized():
if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
torch.distributed.all_reduce(self.word_embeddings_weight().data,
group=mpu.get_embedding_group())
else:
print("WARNING! Distributed processes aren't initialized, so "
"word embeddings in the last layer are not initialized. "
"If you are just manipulating a model this is fine, but "
"this needs to be handled manually. If you are training "
"something is definitely wrong.")
def conversion_helper(val, conversion):
"""Apply conversion to val. Recursively apply conversion if `val`
#is a nested tuple/list structure."""
if not isinstance(val, (tuple, list)):
return conversion(val)
rtn = [conversion_helper(v, conversion) for v in val]
if isinstance(val, tuple):
rtn = tuple(rtn)
return rtn
def fp32_to_float16(val, float16_convertor):
"""Convert fp32 `val` to fp16/bf16"""
def half_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, _FLOAT_TYPES):
val = float16_convertor(val)
return val
return conversion_helper(val, half_conversion)
def float16_to_fp32(val):
"""Convert fp16/bf16 `val` to fp32"""
def float_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)):
val = val.float()
return val
return conversion_helper(val, float_conversion)
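
# A minimal usage sketch for the conversion helpers above: arbitrarily nested tuples/lists are
# walked recursively and only floating tensors are converted. Values are illustrative only.
def _conversion_helper_sketch():
    import torch

    nested = (torch.ones(2), [torch.zeros(3), torch.ones(1)])
    halved = fp32_to_float16(nested, lambda v: v.half())
    restored = float16_to_fp32(halved)

    assert halved[0].dtype == torch.float16
    assert restored[0].dtype == torch.float32
    return restored
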
class Float16Module(MegatronModule):
def __init__(self, module, args):
super(Float16Module, self).__init__()
if args.fp16:
self.add_module('module', module.half())
def float16_convertor(val):
return val.half()
elif args.bf16:
self.add_module('module', module.bfloat16())
def float16_convertor(val):
return val.bfloat16()
else:
raise Exception('should not be here')
self.float16_convertor = float16_convertor
def forward(self, *inputs, **kwargs):
if mpu.is_pipeline_first_stage():
inputs = fp32_to_float16(inputs, self.float16_convertor)
outputs = self.module(*inputs, **kwargs)
if mpu.is_pipeline_last_stage():
outputs = float16_to_fp32(outputs)
return outputs
def state_dict(self, destination=None, prefix='', keep_vars=False):
return self.module.state_dict(destination, prefix, keep_vars)
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
return self.module.state_dict_for_save_checkpoint(destination, prefix,
keep_vars)
def load_state_dict(self, state_dict, strict=True):
self.module.load_state_dict(state_dict, strict=strict)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multiple choice model."""
import torch
from megatron import get_args, print_rank_last
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from .module import MegatronModule
class MultipleChoice(MegatronModule):
def __init__(self,
num_tokentypes=2,
pre_process=True,
post_process=True):
super(MultipleChoice, self).__init__(share_word_embeddings=False)
args = get_args()
init_method = init_method_normal(args.init_method_std)
self.pre_process = pre_process
self.post_process = post_process
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=True,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method_normal(args.init_method_std,
args.num_layers),
pre_process=self.pre_process,
post_process=self.post_process)
# Multi-choice head.
if self.post_process:
self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
self.multichoice_head = get_linear_layer(args.hidden_size, 1,
init_method)
self._multichoice_head_key = 'multichoice_head'
def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
self.language_model.set_input_tensor(input_tensor)
def forward(self, model_input, attention_mask, tokentype_ids=None):
# [batch, choices, sequence] --> [batch * choices, sequence] -->
# transformer --> [batch, choices] --> softmax
# Ensure the shape is [batch-size, choices, sequence]
assert len(attention_mask.shape) == 3
num_choices = attention_mask.shape[1]
# Reshape and treat choice dimension the same as batch.
attention_mask = attention_mask.view(-1, attention_mask.size(-1))
extended_attention_mask = bert_extended_attention_mask(attention_mask)
input_ids = model_input
# Do the same as attention_mask for input_ids, tokentype_ids
assert len(input_ids.shape) == 3
assert len(tokentype_ids.shape) == 3
input_ids = input_ids.view(-1, input_ids.size(-1))
tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))
position_ids = bert_position_ids(input_ids)
lm_output = self.language_model(
input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids
)
if self.post_process:
_, pooled_output = lm_output
multichoice_output = self.multichoice_dropout(pooled_output)
multichoice_logits = self.multichoice_head(multichoice_output)
# Reshape back to separate choices.
multichoice_logits = multichoice_logits.view(-1, num_choices)
return multichoice_logits
return lm_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.post_process:
state_dict_[self._multichoice_head_key] \
= self.multichoice_head.state_dict(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
if self.post_process:
if self._multichoice_head_key in state_dict:
self.multichoice_head.load_state_dict(
state_dict[self._multichoice_head_key], strict=strict)
else:
print_rank_last('***WARNING*** could not find {} in the checkpoint, '
'initializing to random'.format(
self._multichoice_head_key))
# Extracted from: https://github.com/EleutherAI/gpt-neox
import torch
class RotaryEmbedding(torch.nn.Module):
def __init__(self, dim, base=10000, precision=torch.half):
super().__init__()
inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
self.register_buffer('inv_freq', inv_freq)
self.max_seq_len_cached = None
self.cos_cached = None
self.sin_cached = None
self.precision = precision
def forward(self, x, seq_dim=1, seq_len=None):
if seq_len is None:
seq_len = x.shape[seq_dim]
if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
freqs = torch.einsum('i,j->ij', t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
if self.precision == torch.bfloat16:
emb = emb.float()
# [sx, 1 (b * np), hn]
self.cos_cached = emb.cos()[:, None, :]
self.sin_cached = emb.sin()[:, None, :]
if self.precision == torch.bfloat16:
self.cos_cached = self.cos_cached.bfloat16()
self.sin_cached = self.sin_cached.bfloat16()
return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
# rotary pos emb helpers:
def rotate_half(x):
x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions
@torch.jit.script
def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...]
return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
def apply_rotary_pos_emb_torch(q, k, cos, sin, offset: int = 0): # jitting fails with bf16
cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...]
return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
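
# A minimal usage sketch for the rotary helpers above, applied to dummy query/key tensors shaped
# [sq, b * np, hn] to match the cached cos/sin layout. Shapes and sizes are illustrative only.
def _rotary_embedding_sketch():
    import torch

    seq_len, batch_times_heads, head_dim = 4, 2, 8
    q = torch.randn(seq_len, batch_times_heads, head_dim)
    k = torch.randn(seq_len, batch_times_heads, head_dim)

    rotary = RotaryEmbedding(head_dim, precision=torch.float)
    cos, sin = rotary(q, seq_dim=0)                   # each [sq, 1, hn]

    # apply_rotary_pos_emb (jit-scripted) computes the same thing; the torch variant is used here.
    q_rot, k_rot = apply_rotary_pos_emb_torch(q, k, cos, sin)
    assert q_rot.shape == q.shape and k_rot.shape == k.shape
    return q_rot, k_rot
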
import os
import torch
from megatron import get_args, print_rank_0
from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
from megatron.model.bert_model import BertModel
from .module import MegatronModule
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.language_model import get_language_model
from megatron.model.utils import scaled_init_method_normal
from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
def general_ict_model_provider(only_query_model=False, only_block_model=False):
"""Build the model."""
args = get_args()
assert args.ict_head_size is not None, \
"Need to specify --ict-head-size to provide an ICTBertModel"
assert mpu.get_tensor_model_parallel_world_size() == 1 and mpu.get_pipeline_model_parallel_world_size() == 1, \
"Model parallel size > 1 not supported for ICT"
print_rank_0('building ICTBertModel...')
# simpler to just keep using 2 tokentypes since the LM we initialize with has 2 tokentypes
model = ICTBertModel(
ict_head_size=args.ict_head_size,
num_tokentypes=2,
parallel_output=True,
only_query_model=only_query_model,
only_block_model=only_block_model)
return model
class ICTBertModel(MegatronModule):
"""Bert-based module for Inverse Cloze task."""
def __init__(self,
ict_head_size,
num_tokentypes=1,
parallel_output=True,
only_query_model=False,
only_block_model=False):
super(ICTBertModel, self).__init__()
bert_kwargs = dict(
ict_head_size=ict_head_size,
num_tokentypes=num_tokentypes,
parallel_output=parallel_output
)
assert not (only_block_model and only_query_model)
self.use_block_model = not only_query_model
self.use_query_model = not only_block_model
if self.use_query_model:
# this model embeds (pseudo-)queries - Embed_input in the paper
self.query_model = IREncoderBertModel(**bert_kwargs)
self._query_key = 'question_model'
if self.use_block_model:
# this model embeds evidence blocks - Embed_doc in the paper
self.block_model = IREncoderBertModel(**bert_kwargs)
self._block_key = 'context_model'
def forward(self, query_tokens, query_attention_mask, block_tokens, block_attention_mask):
"""Run a forward pass for each of the models and return the respective embeddings."""
query_logits = self.embed_query(query_tokens, query_attention_mask)
block_logits = self.embed_block(block_tokens, block_attention_mask)
return query_logits, block_logits
def embed_query(self, query_tokens, query_attention_mask):
"""Embed a batch of tokens using the query model"""
if self.use_query_model:
query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0)
query_ict_logits, _ = self.query_model.forward(query_tokens, query_attention_mask, query_types)
return query_ict_logits
else:
raise ValueError("Cannot embed query without query model.")
def embed_block(self, block_tokens, block_attention_mask):
"""Embed a batch of tokens using the block model"""
if self.use_block_model:
block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0)
block_ict_logits, _ = self.block_model.forward(block_tokens, block_attention_mask, block_types)
return block_ict_logits
else:
raise ValueError("Cannot embed block without block model.")
def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False):
"""Save dict with state dicts of each of the models."""
state_dict_ = {}
if self.use_query_model:
state_dict_[self._query_key] \
= self.query_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
if self.use_block_model:
state_dict_[self._block_key] \
= self.block_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Load the state dicts of each of the models"""
if self.use_query_model:
print("Loading ICT query model", flush=True)
self.query_model.load_state_dict(
state_dict[self._query_key], strict=strict)
if self.use_block_model:
print("Loading ICT block model", flush=True)
self.block_model.load_state_dict(
state_dict[self._block_key], strict=strict)
def init_state_dict_from_bert(self):
"""Initialize the state from a pretrained BERT model on iteration zero of ICT pretraining"""
args = get_args()
tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
if not os.path.isfile(tracker_filename):
raise FileNotFoundError("Could not find BERT load for ICT")
with open(tracker_filename, 'r') as f:
iteration = int(f.read().strip())
assert iteration > 0
checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
if mpu.get_data_parallel_rank() == 0:
print('global rank {} is loading checkpoint {}'.format(
torch.distributed.get_rank(), checkpoint_name))
try:
state_dict = torch.load(checkpoint_name, map_location='cpu')
except BaseException:
raise ValueError("Could not load checkpoint")
# load the LM state dict into each model
model_dict = state_dict['model']['language_model']
self.query_model.language_model.load_state_dict(model_dict)
self.block_model.language_model.load_state_dict(model_dict)
# give each model the same ict_head to begin with as well
query_ict_head_state_dict = self.state_dict_for_save_checkpoint()[self._query_key]['ict_head']
self.block_model.ict_head.load_state_dict(query_ict_head_state_dict)
class IREncoderBertModel(MegatronModule):
"""BERT-based encoder for queries or blocks used for learned information retrieval."""
def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True):
super(IREncoderBertModel, self).__init__()
args = get_args()
self.ict_head_size = ict_head_size
self.parallel_output = parallel_output
init_method = init_method_normal(args.init_method_std)
scaled_init_method = scaled_init_method_normal(args.init_method_std,
args.num_layers)
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=True,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method)
self.ict_head = get_linear_layer(args.hidden_size, ict_head_size, init_method)
self._ict_head_key = 'ict_head'
def forward(self, input_ids, attention_mask, tokentype_ids=None):
extended_attention_mask = bert_extended_attention_mask(
attention_mask, next(self.language_model.parameters()).dtype)
position_ids = bert_position_ids(input_ids)
lm_output, pooled_output = self.language_model(
input_ids,
position_ids,
extended_attention_mask,
tokentype_ids=tokentype_ids)
# Output.
ict_logits = self.ict_head(pooled_output)
return ict_logits, None
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
state_dict_[self._ict_head_key] \
= self.ict_head.state_dict(destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
self.ict_head.load_state_dict(
state_dict[self._ict_head_key], strict=strict)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""T5 model."""
import torch
from megatron import (
get_args,
mpu
)
from megatron.enums import AttnMaskType
from megatron.model.language_model import parallel_lm_logits, get_language_model
from megatron.model.transformer import LayerNorm
from megatron.model.utils import (
openai_gelu,
get_linear_layer,
init_method_normal,
scaled_init_method_normal
)
from .module import MegatronModule
def t5_extended_attention_mask(attention_mask_list):
def attn_mask_postprocess(attn_mask):
# [b, 1, s, s]
extended_attention_mask = attn_mask.unsqueeze(1)
return extended_attention_mask
return [attn_mask_postprocess(attn_mask) for attn_mask in attention_mask_list]
def t5_position_ids(token_ids):
# Create position ids
seq_length = token_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long,
device=token_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
return position_ids
class T5LMHead(MegatronModule):
"""Masked LM head for T5
Arguments:
mpu_vocab_size: model parallel size of vocabulary.
hidden_size: hidden size
init_method: init method for weight initialization
layernorm_epsilon: tolerance for layer norm divisions
        parallel_output: whether output logits are distributed or not.
"""
def __init__(self, mpu_vocab_size, parallel_output):
super(T5LMHead, self).__init__()
args = get_args()
self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
self.bias.model_parallel = True
self.bias.partition_dim = 0
self.bias.stride = 1
self.parallel_output = parallel_output
def forward(self, hidden_states, word_embeddings_weight):
output = parallel_lm_logits(hidden_states,
word_embeddings_weight,
self.parallel_output,
bias=self.bias)
return output
class T5Model(MegatronModule):
"""T5 Language model."""
def __init__(self, num_tokentypes=0, parallel_output=True):
super(T5Model, self).__init__()
args = get_args()
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
self.parallel_output = parallel_output
init_method = init_method_normal(args.init_method_std)
scaled_init_method = scaled_init_method_normal(args.init_method_std,
args.num_layers)
self.language_model, self._language_model_key = get_language_model(
num_tokentypes=num_tokentypes,
add_pooler=False,
add_decoder=True,
encoder_attn_mask_type=AttnMaskType.padding,
init_method=init_method,
scaled_init_method=scaled_init_method)
self.lm_head = T5LMHead(
self.language_model.embedding.word_embeddings.weight.size(0),
parallel_output)
self._lm_head_key = 'lm_head'
def set_input_tensor(self, input_tensor):
"""See megatron.model.transformer.set_input_tensor()"""
self.language_model.set_input_tensor(input_tensor)
def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask,
decoder_attn_mask, encoder_decoder_attn_mask,
tokentype_ids=None, lm_labels=None, enc_hidden_states=None):
# Converting the attention masks to proper parameter settings
encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask = t5_extended_attention_mask(
[encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask])
encoder_position_ids = t5_position_ids(encoder_input_ids)
decoder_position_ids = t5_position_ids(decoder_input_ids)
lm_output = self.language_model(encoder_input_ids,
encoder_position_ids,
encoder_attn_mask,
decoder_input_ids,
decoder_position_ids,
decoder_attn_mask,
encoder_decoder_attn_mask,
tokentype_ids=tokentype_ids,
enc_hidden_states=enc_hidden_states)
decoder_output, encoder_output = lm_output
# Output.
lm_logits = self.lm_head(decoder_output,
self.language_model.embedding.word_embeddings.weight)
if lm_labels is None:
return lm_logits, encoder_output
else:
if self.fp16_lm_cross_entropy:
assert lm_logits.dtype == torch.half
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
else:
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
lm_labels)
return lm_loss, encoder_output
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
"""For easy load when model is combined with other heads,
add an extra key."""
state_dict_ = {}
state_dict_[self._language_model_key] \
= self.language_model.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
state_dict_[self._lm_head_key] \
= self.lm_head.state_dict_for_save_checkpoint(
destination, prefix, keep_vars)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
self.language_model.load_state_dict(
state_dict[self._language_model_key], strict=strict)
self.lm_head.load_state_dict(state_dict[self._lm_head_key],
strict=strict)