transformer utils (#1181)

Co-authored-by: Piotr Bialecki <pbialecki@nvidia.com> Co-authored-by: Eddie Yan <eddiey@nvidia.com> Co-authored-by: Rishi Puri <riship@nvidia.com> Co-authored-by: Sangkug Lym <slym@nvidia.com>

transformer utils (#1181)
Co-authored-by: Piotr Bialecki <pbialecki@nvidia.com> Co-authored-by: Eddie Yan <eddiey@nvidia.com> Co-authored-by: Rishi Puri <riship@nvidia.com> Co-authored-by: Sangkug Lym <slym@nvidia.com>
365fdc18 · Masaki Kozuki · GitHub · bdac244e · 365fdc18 · 365fdc18
Unverified Commit 365fdc18 authored Oct 02, 2021 by Masaki Kozuki Committed by GitHub Oct 02, 2021
20 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,144 @@ build
 docs/build
 *~
 __pycache__
+.vscode
+# Copied from https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
 *.so
-.vscode
\ No newline at end of file
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
--- a/apex/__init__.py
+++ b/apex/__init__.py
@@ -18,3 +18,4 @@ from . import fp16_utils
 from . import optimizers
 from . import normalization
 from . import pyprof
+from . import transformer
--- a/apex/_autocast_utils.py
+++ b/apex/_autocast_utils.py
+import torch
+def _cast_if_autocast_enabled(*args):
+    if not torch.is_autocast_enabled():
+        return args
+    else:
+        return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
--- a/apex/normalization/__init__.py
+++ b/apex/normalization/__init__.py
-from .fused_layer_norm import FusedLayerNorm
+from .fused_layer_norm import FusedLayerNorm, MixedFusedLayerNorm
--- a/apex/normalization/fused_layer_norm.py
+++ b/apex/normalization/fused_layer_norm.py
-import math
+import importlib
-import torch
 import numbers
+import torch
 from torch.nn.parameter import Parameter
 from torch.nn import init
 from torch.nn import functional as F
-import importlib
+from apex._autocast_utils import _cast_if_autocast_enabled
 global fused_layer_norm_cuda
 fused_layer_norm_cuda = None
 class FusedLayerNormAffineFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input, weight, bias, normalized_shape, eps):
+        global fused_layer_norm_cuda
+        if fused_layer_norm_cuda is None:
+            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+        ctx.normalized_shape = normalized_shape
+        ctx.eps = eps
+        input_ = input.contiguous()
+        weight_ = weight.contiguous()
+        bias_ = bias.contiguous()
+        output, mean, invvar = fused_layer_norm_cuda.forward_affine(
+            input_, ctx.normalized_shape, weight_, bias_, ctx.eps
+        )
+        ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
+        return output
+    @staticmethod
+    def backward(ctx, grad_output):
+        input_, weight_, bias_, mean, invvar = ctx.saved_tensors
+        grad_input = grad_weight = grad_bias = None
+        grad_input, grad_weight, grad_bias = fused_layer_norm_cuda.backward_affine(
+            grad_output.contiguous(), mean, invvar, input_, ctx.normalized_shape, weight_, bias_, ctx.eps
+        )
+        return grad_input, grad_weight, grad_bias, None, None
+class FusedLayerNormAffineMixedDtypesFunction(FusedLayerNormAffineFunction):
+    @staticmethod
+    def forward(ctx, input, weight, bias, normalized_shape, eps):
+        global fused_layer_norm_cuda
+        if fused_layer_norm_cuda is None:
+            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+        ctx.normalized_shape = normalized_shape
+        ctx.eps = eps
+        input_ = input.contiguous()
+        weight_ = weight.contiguous()
+        bias_ = bias.contiguous()
+        output, mean, invvar = fused_layer_norm_cuda.forward_affine_mixed_dtypes(
+            input_, ctx.normalized_shape, weight_, bias_, ctx.eps
+        )
+        ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
+        return output
-  @staticmethod
-  def forward(ctx, input, weight, bias, normalized_shape, eps):
-    global fused_layer_norm_cuda
-    if fused_layer_norm_cuda is None:
-        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
-    ctx.normalized_shape = normalized_shape
-    ctx.eps = eps
-    input_ = input.contiguous()
-    weight_ = weight.contiguous()
-    bias_ = bias.contiguous()
-    output, mean, invvar = fused_layer_norm_cuda.forward_affine(
-        input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
-    ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
-    return output
-  @staticmethod
-  def backward(ctx, grad_output):
-    input_, weight_, bias_, mean, invvar = ctx.saved_tensors
-    grad_input = grad_weight = grad_bias = None
-    grad_input, grad_weight, grad_bias = fused_layer_norm_cuda.backward_affine(
-        grad_output.contiguous(), mean, invvar,
-        input_, ctx.normalized_shape,
-        weight_, bias_, ctx.eps)
-    return grad_input, grad_weight, grad_bias, None, None
 class FusedLayerNormFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input, normalized_shape, eps):
+        global fused_layer_norm_cuda
+        if fused_layer_norm_cuda is None:
+            fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+        ctx.normalized_shape = normalized_shape
+        ctx.eps = eps
+        input_ = input.contiguous()
+        output, mean, invvar = fused_layer_norm_cuda.forward(input_, ctx.normalized_shape, ctx.eps)
+        ctx.save_for_backward(input_, mean, invvar)
+        return output
+    @staticmethod
+    def backward(ctx, grad_output):
+        input_, mean, invvar = ctx.saved_tensors
+        grad_input = None
+        grad_input = fused_layer_norm_cuda.backward(
+            grad_output.contiguous(), mean, invvar, input_, ctx.normalized_shape, ctx.eps
+        )
+        return grad_input, None, None
+def fused_layer_norm_affine(input, weight, bias, normalized_shape, eps=1e-6):
+    args = _cast_if_autocast_enabled(input, weight, bias, normalized_shape, eps)
+    with torch.cuda.amp.autocast(enabled=False):
+        return FusedLayerNormAffineFunction.apply(*args)
-  @staticmethod
-  def forward(ctx, input, normalized_shape, eps):
-    global fused_layer_norm_cuda
-    if fused_layer_norm_cuda is None:
-        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
-    ctx.normalized_shape = normalized_shape
-    ctx.eps = eps
-    input_ = input.contiguous()
-    output, mean, invvar = fused_layer_norm_cuda.forward(
-        input_, ctx.normalized_shape, ctx.eps)
-    ctx.save_for_backward(input_, mean, invvar)
-    return output
-  @staticmethod
-  def backward(ctx, grad_output):
-    input_, mean, invvar = ctx.saved_tensors
-    grad_input = None
-    grad_input = fused_layer_norm_cuda.backward(
-        grad_output.contiguous(), mean, invvar,
-        input_, ctx.normalized_shape,
-        ctx.eps)
-    return grad_input, None, None
-def fused_layer_norm_affine(input, normalized_shape, weight, bias, eps=1e-6):
-    return FusedLayerNormAffineFunction.apply(input, weight, bias, normalized_shape, eps)
 def fused_layer_norm(input, normalized_shape, eps=1e-6):
-    return FusedLayerNormFunction.apply(input, normalized_shape, eps)
+    args = _cast_if_autocast_enabled(input, normalized_shape, eps)
+    with torch.cuda.amp.autocast(enabled=False):
+        return FusedLayerNormFunction.apply(*args)
+def mixed_dtype_fused_layer_norm_affine(input, weight, bias, normalized_shape, eps=1e-6):
+    args = _cast_if_autocast_enabled(input, weight, bias, normalized_shape, eps)
+    with torch.cuda.amp.autocast(enabled=False):
+        return FusedLayerNormAffineMixedDtypesFunction.apply(*args)
 class FusedLayerNorm(torch.nn.Module):
    r"""Applies Layer Normalization over a mini-batch of inputs as described in
@@ -126,8 +158,9 @@ class FusedLayerNorm(torch.nn.Module):
    .. _`Layer Normalization`: https://arxiv.org/abs/1607.06450
    """
    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
-        super(FusedLayerNorm, self).__init__()
+        super().__init__()
        global fused_layer_norm_cuda
        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
@@ -141,8 +174,8 @@ class FusedLayerNorm(torch.nn.Module):
            self.weight = Parameter(torch.Tensor(*normalized_shape))
            self.bias = Parameter(torch.Tensor(*normalized_shape))
        else:
-            self.register_parameter('weight', None)
+            self.register_parameter("weight", None)
-            self.register_parameter('bias', None)
+            self.register_parameter("bias", None)
        self.reset_parameters()
    def reset_parameters(self):
@@ -152,14 +185,34 @@ class FusedLayerNorm(torch.nn.Module):
    def forward(self, input):
        if not input.is_cuda:
-            return  F.layer_norm(
+            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
-                input, self.normalized_shape, self.weight, self.bias, self.eps)
        if self.elementwise_affine:
-          return FusedLayerNormAffineFunction.apply(
+            return fused_layer_norm_affine(input, self.weight, self.bias, self.normalized_shape, self.eps)
-              input, self.weight, self.bias, self.normalized_shape,self.eps)
        else:
-          return FusedLayerNormFunction.apply(input, self.normalized_shape, self.eps)
+            return fused_layer_norm(input, self.normalized_shape, self.eps)
    def extra_repr(self):
-        return '{normalized_shape}, eps={eps}, ' \
+        return "{normalized_shape}, eps={eps}, " "elementwise_affine={elementwise_affine}".format(**self.__dict__)
-            'elementwise_affine={elementwise_affine}'.format(**self.__dict__)
+# NOTE (mkozuki): Why "mixed"?
+# MixedFusedLayerNorm differs from FusedLayerNorm in that this layer norm uses parameter's dtype
+# as output tensor's dtype while FusedLayerNorm uses input tensor's dtype for output tensor's dtype.
+# See: `layer_norm_affine` and `layer_norm_affine_mixed_dtypes` in "csrc/layer_norm_cuda.cpp"
+class MixedFusedLayerNorm(FusedLayerNorm):
+    def __init__(self, normalized_shape, eps=1e-5, **kwargs):
+        if "elementwise_affine" in kwargs:
+            import warnings
+            warnings.warn("MixedFusedLayerNorm does not support `elementwise_affine` argument")
+            elementwise_affine = kwargs.pop("elementwise_affine")
+            if not elementwise_affine:
+                raise RuntimeError("MixedFusedLayerNorm does not support `elementwise_affine = False`")
+        super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=True)
+    def forward(self, input: torch.Tensor):
+        # NOTE (mkozuki): CPU path is here mainly for unittest sake.
+        if not input.is_cuda:
+            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
+        return mixed_dtype_fused_layer_norm_affine(input, self.weight, self.bias, self.normalized_shape, self.eps)
--- a/apex/transformer/README.md
+++ b/apex/transformer/README.md
+# apex.transformer
+`apex.transformer` is a module which enables efficient large Transformer models at scale.
+`apex.transformer.tensor_parallel` is based on [NVIDIA/Megatron-LM](https://github.com/NVIDIA/Megatron-LM)'s `megatron.mpu` module.
--- a/apex/transformer/__init__.py
+++ b/apex/transformer/__init__.py
+from . import tensor_parallel
+from . import functional
+from .enums import LayerType
+from .enums import AttnType
+from .enums import AttnMaskType
+from .parallel_state import (
+    is_unitialized,
+    destroy_model_parallel,
+    get_data_parallel_group,
+    get_data_parallel_rank,
+    get_data_parallel_world_size,
+    get_embedding_group,
+    get_model_parallel_group,
+    get_tensor_model_parallel_group,
+    get_pipeline_model_parallel_group,
+    get_tensor_model_parallel_rank,
+    set_tensor_model_parallel_rank,
+    get_pipeline_model_parallel_rank,
+    set_pipeline_model_parallel_rank,
+    is_pipeline_first_stage,
+    is_pipeline_last_stage,
+    get_tensor_model_parallel_src_rank,
+    get_pipeline_model_parallel_first_rank,
+    get_pipeline_model_parallel_last_rank,
+    get_pipeline_model_parallel_next_rank,
+    get_pipeline_model_parallel_prev_rank,
+    get_tensor_model_parallel_world_size,
+    set_tensor_model_parallel_world_size,
+    get_pipeline_model_parallel_world_size,
+    set_pipeline_model_parallel_world_size,
+    get_virtual_pipeline_model_parallel_rank,
+    set_virtual_pipeline_model_parallel_rank,
+    initialize_model_parallel,
+    model_parallel_is_initialized,
+)
--- a/apex/transformer/enums.py
+++ b/apex/transformer/enums.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import enum
+class LayerType(enum.Enum):
+    encoder = 1
+    decoder = 2
+class AttnType(enum.Enum):
+    self_attn = 1
+    cross_attn = 2
+class AttnMaskType(enum.Enum):
+    padding = 1
+    causal = 2
--- a/apex/transformer/functional/__init__.py
+++ b/apex/transformer/functional/__init__.py
+from .fused_softmax import FusedScaleMaskSoftmax
+__all__ = [
+    "FusedScaleMaskSoftmax",
+]
--- a/apex/transformer/functional/fused_softmax.py
+++ b/apex/transformer/functional/fused_softmax.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from apex._autocast_utils import _cast_if_autocast_enabled
+from ..enums import AttnMaskType
+class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
+    """
+    Fused operation which performs following three operations in sequence
+    1. Scale the tensor.
+    2. Apply upper triangular mask (typically used in gpt models).
+    3. Perform softmax.
+    """
+    @staticmethod
+    def forward(ctx, inputs, scale):
+        import scaled_upper_triang_masked_softmax_cuda
+        scale_t = torch.tensor([scale])
+        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0])
+        ctx.save_for_backward(softmax_results, scale_t)
+        return softmax_results
+    @staticmethod
+    @torch.cuda.amp.custom_bwd
+    def backward(ctx, output_grads):
+        import scaled_upper_triang_masked_softmax_cuda
+        softmax_results, scale_t = ctx.saved_tensors
+        input_grads = scaled_upper_triang_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
+        return input_grads, None
+def scaled_upper_triang_masked_softmax(inputs, _, scale):
+    b, np, sq, sk = inputs.size()
+    assert sq == sk, "causal mask is only for self attention"
+    # Reshaping input to 3D tensor (attn_batches, sq, sk)
+    inputs = inputs.view(-1, sq, sk)
+    args = _cast_if_autocast_enabled(inputs, scale)
+    with torch.cuda.amp.autocast(enabled=False):
+        probs = ScaledUpperTriangMaskedSoftmax.apply(*args)
+    return probs.view(b, np, sq, sk)
+# NOTE (mkozuki): `ScaledMaskedSoftmax` somehow doesn't work well with `torch.cuda.amp.custom_fwd`.
+# Without `cast_inputs` kwarg, somehow inputs are not cast to dtype used in the autocast context.
+# So I needed to manually write two `torch.autograd.Function` inheritances.
+# Fused operation which performs following three operations in sequence
+# 1. Scale the tensor.
+# 2. Apply the mask.
+# 3. Perform softmax.
+class ScaledMaskedSoftmax(torch.autograd.Function):
+    @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
+    def forward(ctx, inputs, mask, scale):
+        import scaled_masked_softmax_cuda
+        scale_t = torch.tensor([scale])
+        softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
+        ctx.save_for_backward(softmax_results, scale_t)
+        return softmax_results
+    @staticmethod
+    @torch.cuda.amp.custom_bwd
+    def backward(ctx, output_grads):
+        import scaled_masked_softmax_cuda
+        softmax_results, scale_t = ctx.saved_tensors
+        input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
+        return input_grads, None, None
+def scaled_masked_softmax(inputs, mask, scale):
+    # input is 4D tensor (b, np, sq, sk)
+    args = _cast_if_autocast_enabled(inputs, mask, scale)
+    with torch.cuda.amp.autocast(enabled=False):
+        return ScaledMaskedSoftmax.apply(*args)
+class FusedScaleMaskSoftmax(torch.nn.Module):
+    """
+    fused operation: scaling + mask + softmax
+    Arguments:
+        input_in_fp16: flag to indicate if input in fp16 data format.
+        input_in_bf16: flag to indicate if input in bf16 data format.
+        attn_mask_type: attention mask type (pad or causal)
+        scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion
+        mask_func: mask function to be applied.
+        softmax_in_fp32: if true, softmax in performed at fp32 precision.
+        scale: scaling factor used in input tensor scaling.
+    """
+    def __init__(
+        self,
+        input_in_fp16,
+        input_in_bf16,
+        attn_mask_type,
+        scaled_masked_softmax_fusion,
+        mask_func,
+        softmax_in_fp32,
+        scale,
+    ):
+        super().__init__()
+        self.input_in_fp16 = input_in_fp16
+        self.input_in_bf16 = input_in_bf16
+        if self.input_in_fp16 and self.input_in_bf16:
+            raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.")
+        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
+        self.attn_mask_type = attn_mask_type
+        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
+        self.mask_func = mask_func
+        self.softmax_in_fp32 = softmax_in_fp32
+        self.scale = scale
+        if not (self.scale is None or softmax_in_fp32):
+            raise RuntimeError("softmax should be in fp32 when scaled")
+        if self.scaled_masked_softmax_fusion:
+            if self.attn_mask_type == AttnMaskType.causal:
+                self.fused_softmax_func = scaled_upper_triang_masked_softmax
+            elif self.attn_mask_type == AttnMaskType.padding:
+                self.fused_softmax_func = scaled_masked_softmax
+            else:
+                raise ValueError("Invalid attn_mask_type.")
+    def forward(self, input, mask):
+        # [b, np, sq, sk]
+        assert input.dim() == 4
+        if self.is_kernel_available(mask, *input.size()):
+            return self.forward_fused_softmax(input, mask)
+        else:
+            return self.forward_torch_softmax(input, mask)
+    def is_kernel_available(self, mask, b, np, sq, sk):
+        attn_batches = b * np
+        if (
+            self.scaled_masked_softmax_fusion  # user want to fuse
+            and self.input_in_float16  # input must be fp16
+            and mask is not None  # mask tensor must not be None
+            and 16 < sk <= 2048  # sk must be 16 ~ 2048
+            and sq % 4 == 0  # sq must be divisor of 4
+            and attn_batches % 4 == 0  # np * b must be divisor of 4
+        ):
+            if 0 <= sk <= 2048:
+                batch_per_block = self.get_batch_per_block(sq, sk, b, np)
+                if self.attn_mask_type == AttnMaskType.causal:
+                    if attn_batches % batch_per_block == 0:
+                        return True
+                else:
+                    if sq % batch_per_block == 0:
+                        return True
+        return False
+    def forward_fused_softmax(self, input, mask):
+        # input.shape = [b, np, sq, sk]
+        scale = self.scale if self.scale is not None else 1.0
+        return self.fused_softmax_func(input, mask, scale)
+    def forward_torch_softmax(self, input, mask):
+        if self.input_in_float16 and self.softmax_in_fp32:
+            input = input.float()
+        if self.scale is not None:
+            input = input * self.scale
+        mask_output = self.mask_func(input, mask) if mask is not None else input
+        probs = torch.nn.Softmax(dim=-1)(mask_output)
+        if self.input_in_float16 and self.softmax_in_fp32:
+            if self.input_in_fp16:
+                probs = probs.half()
+            else:
+                probs = probs.bfloat16()
+        return probs
+    @staticmethod
+    def get_batch_per_block(sq, sk, b, np):
+        import scaled_masked_softmax_cuda
+        return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
--- a/apex/transformer/parallel_state.py
+++ b/apex/transformer/parallel_state.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model and data parallel groups."""
+import torch
+# TODO (mkozuki): Consider dissecting utils as this utils import is here
+# only for ensure_divisibility
+from .tensor_parallel import utils
+# Intra-layer model parallel group that the current rank belongs to.
+_TENSOR_MODEL_PARALLEL_GROUP = None
+# Inter-layer model parallel group that the current rank belongs to.
+_PIPELINE_MODEL_PARALLEL_GROUP = None
+# Model parallel group (both intra- and pipeline) that the current rank belongs to.
+_MODEL_PARALLEL_GROUP = None
+# Embedding group.
+_EMBEDDING_GROUP = None
+# Data parallel group that the current rank belongs to.
+_DATA_PARALLEL_GROUP = None
+_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
+_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
+# These values enable us to change the mpu sizes on the fly.
+_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
+_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
+_MPU_TENSOR_MODEL_PARALLEL_RANK = None
+_MPU_PIPELINE_MODEL_PARALLEL_RANK = None
+# A list of global ranks for each pipeline group to ease calculation of the source
+# rank when broadcasting from the first or last pipeline stage
+_PIPELINE_GLOBAL_RANKS = None
+def is_unitialized():
+    """Useful for code segments that may be accessed with or without mpu initialization"""
+    return _DATA_PARALLEL_GROUP is None
+def initialize_model_parallel(
+    tensor_model_parallel_size_=1, pipeline_model_parallel_size_=1, virtual_pipeline_model_parallel_size_=None
+):
+    """
+    Initialize model data parallel groups.
+    Arguments:
+        tensor_model_parallel_size: number of GPUs used to parallelize model tensor.
+        pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline.
+    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
+    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
+    the model pipeline. The present function will
+    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
+    and 8 data-parallel groups as:
+        8 data_parallel groups:
+            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
+        8 tensor model-parallel groups:
+            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
+        4 pipeline model-parallel groups:
+            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
+    Note that for efficiency, the caller should make sure adjacent ranks
+    are on the same DGX box. For example if we are using 2 DGX-1 boxes
+    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
+    ranks 8 to 15 belong to the second box.
+    """
+    if torch.distributed.get_rank() == 0:
+        print("> initializing tensor model parallel with size {}".format(tensor_model_parallel_size_))
+        print("> initializing pipeline model parallel with size {}".format(pipeline_model_parallel_size_))
+    # Get world size and rank. Ensure some consistencies.
+    assert torch.distributed.is_initialized()
+    world_size = torch.distributed.get_world_size()
+    tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size)
+    pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size)
+    # TODO (mkozuki): Consider moving `ensure_divisibility` to this file.
+    utils.ensure_divisibility(world_size, tensor_model_parallel_size * pipeline_model_parallel_size)
+    data_parallel_size = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size)
+    num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
+    num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
+    num_data_parallel_groups = world_size // data_parallel_size
+    if virtual_pipeline_model_parallel_size_ is not None:
+        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+        _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
+        _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size_
+    rank = torch.distributed.get_rank()
+    # Build the data-parallel groups.
+    global _DATA_PARALLEL_GROUP
+    assert _DATA_PARALLEL_GROUP is None, "data parallel group is already initialized"
+    all_data_parallel_group_ranks = []
+    for i in range(pipeline_model_parallel_size):
+        start_rank = i * num_pipeline_model_parallel_groups
+        end_rank = (i + 1) * num_pipeline_model_parallel_groups
+        for j in range(tensor_model_parallel_size):
+            ranks = range(start_rank + j, end_rank, tensor_model_parallel_size)
+            all_data_parallel_group_ranks.append(list(ranks))
+            group = torch.distributed.new_group(ranks)
+            if rank in ranks:
+                _DATA_PARALLEL_GROUP = group
+    # Build the model-parallel groups.
+    global _MODEL_PARALLEL_GROUP
+    assert _MODEL_PARALLEL_GROUP is None, "model parallel group is already initialized"
+    for i in range(data_parallel_size):
+        ranks = [data_parallel_group_ranks[i] for data_parallel_group_ranks in all_data_parallel_group_ranks]
+        group = torch.distributed.new_group(ranks)
+        if rank in ranks:
+            _MODEL_PARALLEL_GROUP = group
+    # Build the tensor model-parallel groups.
+    global _TENSOR_MODEL_PARALLEL_GROUP
+    assert _TENSOR_MODEL_PARALLEL_GROUP is None, "tensor model parallel group is already initialized"
+    for i in range(num_tensor_model_parallel_groups):
+        ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
+        group = torch.distributed.new_group(ranks)
+        if rank in ranks:
+            _TENSOR_MODEL_PARALLEL_GROUP = group
+    # Build the pipeline model-parallel groups and embedding groups
+    # (first and last rank in each pipeline model-parallel group).
+    global _PIPELINE_MODEL_PARALLEL_GROUP
+    global _PIPELINE_GLOBAL_RANKS
+    assert _PIPELINE_MODEL_PARALLEL_GROUP is None, "pipeline model parallel group is already initialized"
+    global _EMBEDDING_GROUP
+    assert _EMBEDDING_GROUP is None, "embedding group is already initialized"
+    for i in range(num_pipeline_model_parallel_groups):
+        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
+        group = torch.distributed.new_group(ranks)
+        if rank in ranks:
+            _PIPELINE_MODEL_PARALLEL_GROUP = group
+            _PIPELINE_GLOBAL_RANKS = ranks
+        # Setup embedding group (to exchange gradients between
+        # first and last stages).
+        if len(ranks) > 1:
+            embedding_ranks = [ranks[0], ranks[-1]]
+        else:
+            embedding_ranks = ranks
+        group = torch.distributed.new_group(embedding_ranks)
+        if rank in embedding_ranks:
+            _EMBEDDING_GROUP = group
+def model_parallel_is_initialized():
+    """Check if model and data parallel groups are initialized."""
+    if _TENSOR_MODEL_PARALLEL_GROUP is None or _PIPELINE_MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None:
+        return False
+    return True
+def get_model_parallel_group():
+    """Get the model parallel group the caller rank belongs to."""
+    assert _MODEL_PARALLEL_GROUP is not None, "model parallel group is not initialized"
+    return _MODEL_PARALLEL_GROUP
+def get_tensor_model_parallel_group():
+    """Get the tensor model parallel group the caller rank belongs to."""
+    assert _TENSOR_MODEL_PARALLEL_GROUP is not None, "intra_layer_model parallel group is not initialized"
+    return _TENSOR_MODEL_PARALLEL_GROUP
+def get_pipeline_model_parallel_group():
+    """Get the pipeline model parallel group the caller rank belongs to."""
+    assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, "pipeline_model parallel group is not initialized"
+    return _PIPELINE_MODEL_PARALLEL_GROUP
+def get_data_parallel_group():
+    """Get the data parallel group the caller rank belongs to."""
+    assert _DATA_PARALLEL_GROUP is not None, "data parallel group is not initialized"
+    return _DATA_PARALLEL_GROUP
+def get_embedding_group():
+    """Get the embedding group the caller rank belongs to."""
+    assert _EMBEDDING_GROUP is not None, "embedding group is not initialized"
+    return _EMBEDDING_GROUP
+def set_tensor_model_parallel_world_size(world_size):
+    """Set the tensor model parallel size"""
+    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
+    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size
+def set_pipeline_model_parallel_world_size(world_size):
+    """Set the pipeline model parallel size"""
+    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
+def get_tensor_model_parallel_world_size():
+    """Return world size for the tensor model parallel group."""
+    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
+    if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
+        return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
+    return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
+def get_pipeline_model_parallel_world_size():
+    """Return world size for the pipeline model parallel group."""
+    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+    if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
+        return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+    return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())
+def set_tensor_model_parallel_rank(rank):
+    """Set tensor model parallel rank."""
+    global _MPU_TENSOR_MODEL_PARALLEL_RANK
+    _MPU_TENSOR_MODEL_PARALLEL_RANK = rank
+def set_pipeline_model_parallel_rank(rank):
+    """Set pipeline model parallel rank."""
+    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
+    _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank
+def get_tensor_model_parallel_rank():
+    """Return my rank for the tensor model parallel group."""
+    global _MPU_TENSOR_MODEL_PARALLEL_RANK
+    if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None:
+        return _MPU_TENSOR_MODEL_PARALLEL_RANK
+    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
+def get_pipeline_model_parallel_rank():
+    """Return my rank for the pipeline model parallel group."""
+    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
+    if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
+        return _MPU_PIPELINE_MODEL_PARALLEL_RANK
+    return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())
+def is_pipeline_first_stage(ignore_virtual=False):
+    """Return True if in the first pipeline model-parallel stage, False otherwise."""
+    if not ignore_virtual:
+        if (
+            get_virtual_pipeline_model_parallel_world_size() is not None
+            and get_virtual_pipeline_model_parallel_rank() != 0
+        ):
+            return False
+    return get_pipeline_model_parallel_rank() == 0
+def is_pipeline_last_stage(ignore_virtual=False):
+    """Return True if in the last pipeline model-parallel stage, False otherwise."""
+    if not ignore_virtual:
+        virtual_pipeline_model_parallel_world_size = get_virtual_pipeline_model_parallel_world_size()
+        if virtual_pipeline_model_parallel_world_size is not None and get_virtual_pipeline_model_parallel_rank() != (
+            virtual_pipeline_model_parallel_world_size - 1
+        ):
+            return False
+    return get_pipeline_model_parallel_rank() == (get_pipeline_model_parallel_world_size() - 1)
+def get_virtual_pipeline_model_parallel_rank():
+    """Return the virtual pipeline-parallel rank."""
+    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+def set_virtual_pipeline_model_parallel_rank(rank):
+    """Set the virtual pipeline-parallel rank."""
+    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank
+def get_virtual_pipeline_model_parallel_world_size():
+    """Return the virtual pipeline-parallel world size."""
+    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+def get_tensor_model_parallel_src_rank():
+    """Calculate the global rank corresponding to the first local rank
+    in the tensor model parallel group."""
+    global_rank = torch.distributed.get_rank()
+    local_world_size = get_tensor_model_parallel_world_size()
+    return (global_rank // local_world_size) * local_world_size
+def get_pipeline_model_parallel_first_rank():
+    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
+    return _PIPELINE_GLOBAL_RANKS[0]
+def get_pipeline_model_parallel_last_rank():
+    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
+    last_rank_local = get_pipeline_model_parallel_world_size() - 1
+    return _PIPELINE_GLOBAL_RANKS[last_rank_local]
+def get_pipeline_model_parallel_next_rank():
+    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
+    rank_in_pipeline = get_pipeline_model_parallel_rank()
+    world_size = get_pipeline_model_parallel_world_size()
+    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
+def get_pipeline_model_parallel_prev_rank():
+    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
+    rank_in_pipeline = get_pipeline_model_parallel_rank()
+    world_size = get_pipeline_model_parallel_world_size()
+    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
+def get_data_parallel_world_size():
+    """Return world size for the data parallel group."""
+    return torch.distributed.get_world_size(group=get_data_parallel_group())
+def get_data_parallel_rank():
+    """Return my rank for the data parallel group."""
+    return torch.distributed.get_rank(group=get_data_parallel_group())
+def destroy_model_parallel():
+    """Set the groups to none."""
+    global _MODEL_PARALLEL_GROUP
+    _MODEL_PARALLEL_GROUP = None
+    global _TENSOR_MODEL_PARALLEL_GROUP
+    _TENSOR_MODEL_PARALLEL_GROUP = None
+    global _PIPELINE_MODEL_PARALLEL_GROUP
+    _PIPELINE_MODEL_PARALLEL_GROUP = None
+    global _DATA_PARALLEL_GROUP
+    _DATA_PARALLEL_GROUP = None
+    global _EMBEDDING_GROUP
+    _EMBEDDING_GROUP = None
--- a/apex/transformer/tensor_parallel/__init__.py
+++ b/apex/transformer/tensor_parallel/__init__.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Model parallel utility interface."""
+from .cross_entropy import vocab_parallel_cross_entropy
+from .data import broadcast_data
+from .layers import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+    VocabParallelEmbedding,
+    set_tensor_model_parallel_attributes,
+    set_defaults_if_not_set_tensor_model_parallel_attributes,
+    copy_tensor_model_parallel_attributes,
+)
+from .mappings import (
+    copy_to_tensor_model_parallel_region,
+    gather_from_tensor_model_parallel_region,
+    reduce_from_tensor_model_parallel_region,
+    scatter_to_tensor_model_parallel_region,
+)
+from .random import (
+    checkpoint,
+    get_cuda_rng_tracker,
+    init_checkpointed_activations_memory_buffer,
+    model_parallel_cuda_manual_seed,
+    reset_checkpointed_activations_memory_buffer,
+    gather_split_1d_tensor,
+    split_tensor_into_1d_equal_chunks,
+)
+from .utils import divide, split_tensor_along_last_dim
+__all__ = [
+    # cross_entropy.py
+    "vocab_parallel_cross_entropy",
+    # data.py
+    "broadcast_data",
+    # layers.py
+    "ColumnParallelLinear",
+    "RowParallelLinear",
+    "VocabParallelEmbedding",
+    "set_tensor_model_parallel_attributes",
+    "set_defaults_if_not_set_tensor_model_parallel_attributes",
+    "copy_tensor_model_parallel_attributes",
+    # mappings.py
+    "copy_to_tensor_model_parallel_region",
+    "gather_from_tensor_model_parallel_region",
+    "reduce_from_tensor_model_parallel_region",
+    "scatter_to_tensor_model_parallel_region",
+    # random.py
+    "checkpoint",
+    "get_cuda_rng_tracker",
+    "init_checkpointed_activations_memory_buffer",
+    "model_parallel_cuda_manual_seed",
+    "reset_checkpointed_activations_memory_buffer",
+    "gather_split_1d_tensor",
+    "split_tensor_into_1d_equal_chunks",
+    # utils.py
+    "divide",
+    "split_tensor_along_last_dim",
+]
--- a/apex/transformer/tensor_parallel/cross_entropy.py
+++ b/apex/transformer/tensor_parallel/cross_entropy.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from ..parallel_state import get_tensor_model_parallel_group
+from ..parallel_state import get_tensor_model_parallel_rank
+from ..parallel_state import get_tensor_model_parallel_world_size
+from .utils import VocabUtility
+class _VocabParallelCrossEntropy(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, vocab_parallel_logits, target):
+        # Maximum value along vocab dimension across all GPUs.
+        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
+        torch.distributed.all_reduce(
+            logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group()
+        )
+        # Subtract the maximum value.
+        vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
+        # Get the partition's vocab indecies
+        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
+        partition_vocab_size = vocab_parallel_logits.size()[-1]
+        rank = get_tensor_model_parallel_rank()
+        world_size = get_tensor_model_parallel_world_size()
+        vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size)
+        # Create a mask of valid vocab ids (1 means it needs to be masked).
+        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
+        masked_target = target.clone() - vocab_start_index
+        masked_target[target_mask] = 0
+        # Get predicted-logits = logits[target].
+        # For Simplicity, we convert logits to a 2-D tensor with size
+        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
+        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
+        masked_target_1d = masked_target.view(-1)
+        arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device)
+        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
+        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
+        predicted_logits = predicted_logits_1d.view_as(target)
+        predicted_logits[target_mask] = 0.0
+        # All reduce is needed to get the chunks from other GPUs.
+        torch.distributed.all_reduce(
+            predicted_logits, op=torch.distributed.ReduceOp.SUM, group=get_tensor_model_parallel_group()
+        )
+        # Sum of exponential of logits along vocab dimension across all GPUs.
+        exp_logits = vocab_parallel_logits
+        torch.exp(vocab_parallel_logits, out=exp_logits)
+        sum_exp_logits = exp_logits.sum(dim=-1)
+        torch.distributed.all_reduce(
+            sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=get_tensor_model_parallel_group()
+        )
+        # Loss = log(sum(exp(logits))) - predicted-logit.
+        loss = torch.log(sum_exp_logits) - predicted_logits
+        # Store softmax, target-mask and masked-target for backward pass.
+        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
+        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
+        return loss
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Retreive tensors from the forward path.
+        softmax, target_mask, masked_target_1d = ctx.saved_tensors
+        # All the inputs have softmax as thier gradient.
+        grad_input = softmax
+        # For simplicity, work with the 2D gradient.
+        partition_vocab_size = softmax.size()[-1]
+        grad_2d = grad_input.view(-1, partition_vocab_size)
+        # Add the gradient from matching classes.
+        arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
+        grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float()
+        # Finally elementwise multiplication with the output gradients.
+        grad_input.mul_(grad_output.unsqueeze(dim=-1))
+        return grad_input, None
+def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
+    """Helper function for the cross entropy."""
+    return _VocabParallelCrossEntropy.apply(torch.clone(vocab_parallel_logits), target)
--- a/apex/transformer/tensor_parallel/data.py
+++ b/apex/transformer/tensor_parallel/data.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from ..parallel_state import get_tensor_model_parallel_group
+from ..parallel_state import get_tensor_model_parallel_rank
+from ..parallel_state import get_tensor_model_parallel_src_rank
+_MAX_DATA_DIM = 5
+def _check_data_types(keys, data, target_dtype):
+    """Check that all the keys have the same target data type."""
+    for key in keys:
+        assert data[key].dtype == target_dtype, "{} has data type {} which " "is different than {}".format(
+            key, data[key].dtype, target_dtype
+        )
+def _build_key_size_numel_dictionaries(keys, data):
+    """Build the size on rank 0 and broadcast."""
+    max_dim = _MAX_DATA_DIM
+    sizes = [0 for _ in range(max_dim) for _ in keys]
+    # Pack the sizes on rank zero.
+    if get_tensor_model_parallel_rank() == 0:
+        offset = 0
+        for key in keys:
+            assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM"
+            size = data[key].size()
+            for i, s in enumerate(size):
+                sizes[i + offset] = s
+            offset += max_dim
+    # Move to GPU and broadcast.
+    sizes_cuda = torch.cuda.LongTensor(sizes)
+    torch.distributed.broadcast(
+        sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group(),
+    )
+    # Move back to cpu and unpack.
+    sizes_cpu = sizes_cuda.cpu()
+    key_size = {}
+    key_numel = {}
+    total_numel = 0
+    offset = 0
+    for key in keys:
+        i = 0
+        size = []
+        numel = 1
+        while sizes_cpu[offset + i] > 0:
+            this_size = sizes_cpu[offset + i]
+            size.append(this_size)
+            numel *= this_size
+            i += 1
+        key_size[key] = size
+        key_numel[key] = numel
+        total_numel += numel
+        offset += max_dim
+    return key_size, key_numel, total_numel
+def broadcast_data(keys, data, datatype):
+    """Broadcast data from rank zero of each model parallel group to the
+    members of the same model parallel group.
+    Arguments:
+        keys: list of keys in the data disctionary to be broadcasted
+        data: data dictionary of string keys and cpu tensor values.
+        datatype: torch data type of all tensors in data associated
+                  with keys.
+    """
+    # Build (key, size) and (key, number of elements) dictionaries along
+    # with the total number of elements on all ranks.
+    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data)
+    # Pack on rank zero.
+    if get_tensor_model_parallel_rank() == 0:
+        # Check that all keys have the same data type.
+        _check_data_types(keys, data, datatype)
+        # Flatten the data associated with the keys
+        flatten_data = torch.cat([data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
+    else:
+        flatten_data = torch.empty(total_numel, device=torch.cuda.current_device(), dtype=datatype)
+    # Broadcast
+    torch.distributed.broadcast(
+        flatten_data, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group(),
+    )
+    # Unpack
+    output = {}
+    offset = 0
+    for key in keys:
+        size = key_size[key]
+        numel = key_numel[key]
+        output[key] = flatten_data.narrow(0, offset, numel).view(size)
+        offset += numel
+    return output
--- a/apex/transformer/tensor_parallel/layers.py
+++ b/apex/transformer/tensor_parallel/layers.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+import torch
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch.nn.parameter import Parameter
+from ..parallel_state import get_tensor_model_parallel_group
+from ..parallel_state import get_tensor_model_parallel_rank
+from ..parallel_state import get_tensor_model_parallel_world_size
+from .mappings import copy_to_tensor_model_parallel_region
+from .mappings import gather_from_tensor_model_parallel_region
+from .mappings import reduce_from_tensor_model_parallel_region
+from .mappings import scatter_to_tensor_model_parallel_region
+from .random import get_cuda_rng_tracker
+from .utils import divide
+from .utils import VocabUtility
+_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {
+    "tensor_model_parallel": False,
+    "partition_dim": -1,
+    "partition_stride": 1,
+}
+def param_is_not_tensor_parallel_duplicate(param):
+    return (hasattr(param, "tensor_model_parallel") and param.tensor_model_parallel) or (
+        get_tensor_model_parallel_rank() == 0
+    )
+def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
+    # Make sure the attributes are not set.
+    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
+        assert not hasattr(tensor, attribute)
+    # Set the attributes.
+    setattr(tensor, "tensor_model_parallel", is_parallel)
+    setattr(tensor, "partition_dim", dim)
+    setattr(tensor, "partition_stride", stride)
+def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
+    def maybe_set(attribute, value):
+        if not hasattr(tensor, attribute):
+            setattr(tensor, attribute, value)
+    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
+        maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])
+def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
+    def maybe_copy(attribute):
+        if hasattr(source_tensor, attribute):
+            setattr(destination_tensor, attribute, getattr(source_tensor, attribute))
+    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
+        maybe_copy(attribute)
+def _initialize_affine_weight_gpu(weight, init_method, partition_dim, stride=1):
+    """Initialize affine weight for model parallel on GPU."""
+    set_tensor_model_parallel_attributes(tensor=weight, is_parallel=True, dim=partition_dim, stride=stride)
+    with get_cuda_rng_tracker().fork():
+        init_method(weight)
+# TODO (mkozuki): Re-consider removing params_dtype from arguments to make this
+# more parallel with _initialize_affine_weight_gpu
+def _initialize_affine_weight_cpu(
+    weight,
+    output_size,
+    input_size,
+    per_partition_size,
+    partition_dim,
+    init_method,
+    stride=1,
+    return_master_weight=False,
+    *,
+    params_dtype=torch.float32,
+):
+    """Initialize affine weight for model parallel.
+    Build the master weight on all processes and scatter
+    the relevant chunk."""
+    set_tensor_model_parallel_attributes(tensor=weight, is_parallel=True, dim=partition_dim, stride=stride)
+    # Initialize master weight
+    master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False)
+    init_method(master_weight)
+    master_weight = master_weight.to(dtype=params_dtype)
+    # Split and copy
+    per_partition_per_stride_size = divide(per_partition_size, stride)
+    weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim)
+    rank = get_tensor_model_parallel_rank()
+    world_size = get_tensor_model_parallel_world_size()
+    my_weight_list = weight_list[rank::world_size]
+    with torch.no_grad():
+        torch.cat(my_weight_list, dim=partition_dim, out=weight)
+    if return_master_weight:
+        return master_weight
+    return None
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+    This is mainly adapted from torch.nn.Embedding and all the default
+    values are kept.
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        init_method: method to initialize weights.
+    """
+    def __init__(
+        self, num_embeddings, embedding_dim, init_method=init.xavier_normal_, *, params_dtype=torch.float32, use_cpu_initialization=False,
+    ):
+        super(VocabParallelEmbedding, self).__init__()
+        # Keep the input dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        # Set the detauls for compatibility.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.0
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
+        # Divide the weight matrix along the vocaburaly dimension.
+        self.vocab_start_index, self.vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size(
+            self.num_embeddings, get_tensor_model_parallel_rank(), self.tensor_model_parallel_size
+        )
+        self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
+        # Allocate weights and initialize.
+        if use_cpu_initialization:
+            self.weight = Parameter(
+                torch.empty(self.num_embeddings_per_partition, self.embedding_dim, dtype=params_dtype)
+            )
+            _initialize_affine_weight_cpu(
+                self.weight, self.num_embeddings, self.embedding_dim, self.num_embeddings_per_partition, 0, init_method,
+                params_dtype=params_dtype,
+            )
+        else:
+            self.weight = Parameter(
+                torch.empty(
+                    self.num_embeddings_per_partition,
+                    self.embedding_dim,
+                    device=torch.cuda.current_device(),
+                    dtype=params_dtype,
+                )
+            )
+            _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1)
+    def forward(self, input_):
+        if self.tensor_model_parallel_size > 1:
+            # Build the mask.
+            input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index)
+            # Mask the input.
+            masked_input = input_.clone() - self.vocab_start_index
+            masked_input[input_mask] = 0
+        else:
+            masked_input = input_
+            # Get the embeddings.
+        output_parallel = F.embedding(
+            masked_input,
+            self.weight,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+        # Mask the output embedding.
+        if self.tensor_model_parallel_size > 1:
+            output_parallel[input_mask, :] = 0.0
+        # Reduce across all the model parallel GPUs.
+        output = reduce_from_tensor_model_parallel_region(output_parallel)
+        return output
+class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
+    """
+    Column-parallel linear layer execution with asynchronous all-reduce
+    execution in backprop.
+    """
+    @staticmethod
+    def forward(ctx, input, weight, bias):
+        ctx.save_for_backward(input, weight)
+        ctx.use_bias = bias is not None
+        output = torch.matmul(input, weight.t())
+        if bias is not None:
+            output = output + bias
+        return output
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, weight = ctx.saved_tensors
+        use_bias = ctx.use_bias
+        grad_input = grad_output.matmul(weight)
+        # Asyncronous all-reduce
+        handle = torch.distributed.all_reduce(
+                grad_input, group=get_tensor_model_parallel_group(), async_op=True)
+        # Delay the start of weight gradient computation shortly (3us) to have
+        # all-reduce scheduled first and have GPU resources allocated
+        _ = torch.empty(1, device=grad_output.device) + 1
+        grad_weight = grad_output.t().matmul(input)
+        grad_bias = grad_output.sum(dim=0) if use_bias else None
+        handle.wait()
+        return grad_input, grad_weight, grad_bias
+class ColumnParallelLinear(torch.nn.Module):
+    """Linear layer with column parallelism.
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its second dimension as A = [A_1, ..., A_p].
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias
+        gather_output: If true, call all-gether on output and make Y avaiable
+                       to all GPUs, otherwise, every GPU will have its output
+                       which is Y_i = XA_i
+        init_method: method to initialize weights. Note that bias is always set
+                     to zero.
+        stride: For the strided linear layers.
+        keep_master_weight_for_test: This was added for testing and should be
+                                     set to False. It returns the master weights
+                                     used for initialization.
+        skip_bias_add: This was added to enable performance optimations where bias
+                       can be fused with other elementwise operations. we skip
+                       adding bias but instead return it.
+    """
+    def __init__(
+        self,
+        input_size,
+        output_size,
+        bias=True,
+        gather_output=True,
+        init_method=init.xavier_normal_,
+        stride=1,
+        keep_master_weight_for_test=False,
+        skip_bias_add=False,
+        *,
+        no_async_tensor_model_parallel_allreduce=False,
+        params_dtype=torch.float32,
+        use_cpu_initialization=False,
+    ):
+        super(ColumnParallelLinear, self).__init__()
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.gather_output = gather_output
+        # Divide the weight matrix along the last dimension.
+        world_size = get_tensor_model_parallel_world_size()
+        self.output_size_per_partition = divide(output_size, world_size)
+        self.skip_bias_add = skip_bias_add
+        # Parameters.
+        # Note: torch.nn.functional.linear performs XA^T + b and as a result
+        # we allocate the transpose.
+        # Initialize weight.
+        if use_cpu_initialization:
+            self.weight = Parameter(torch.empty(self.output_size_per_partition, self.input_size, dtype=params_dtype))
+            self.master_weight = _initialize_affine_weight_cpu(
+                self.weight,
+                self.output_size,
+                self.input_size,
+                self.output_size_per_partition,
+                0,
+                init_method,
+                stride=stride,
+                return_master_weight=keep_master_weight_for_test,
+                params_dtype=params_dtype,
+            )
+        else:
+            self.weight = Parameter(
+                torch.empty(
+                    self.output_size_per_partition,
+                    self.input_size,
+                    device=torch.cuda.current_device(),
+                    dtype=params_dtype,
+                )
+            )
+            _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=stride)
+        if bias:
+            if use_cpu_initialization:
+                self.bias = Parameter(torch.empty(self.output_size_per_partition, dtype=params_dtype))
+            else:
+                self.bias = Parameter(
+                    torch.empty(self.output_size_per_partition, device=torch.cuda.current_device(), dtype=params_dtype)
+                )
+            set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
+            # Always initialize bias to zero.
+            with torch.no_grad():
+                self.bias.zero_()
+        else:
+            self.register_parameter("bias", None)
+        self.async_tensor_model_parallel_allreduce = (
+                not no_async_tensor_model_parallel_allreduce and
+                world_size > 1)
+    def forward(self, input_):
+        bias = self.bias if not self.skip_bias_add else None
+        if self.async_tensor_model_parallel_allreduce:
+            input_shape = input_.shape
+            input_ = input_.view(input_shape[0] * input_shape[1],input_shape[2])
+            # Matrix multiply with asynchronous all-reduce execution
+            output_parallel = ColumnParallelLinearWithAsyncAllreduce.apply(
+                    input_, self.weight, bias)
+            output_parallel = output_parallel.view(
+                    input_shape[0], input_shape[1], output_parallel.shape[1])
+        else:
+            # Set up backprop all-reduce.
+            input_parallel = copy_to_tensor_model_parallel_region(input_)
+            # Matrix multiply.
+            output_parallel = F.linear(input_parallel, self.weight, bias)
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = gather_from_tensor_model_parallel_region(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+class RowParallelLinear(torch.nn.Module):
+    """Linear layer with row parallelism.
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its first dimension and X along its second dimension as:
+               -   -
+              | A_1 |
+              | .   |
+          A = | .   |        X = [X_1, ..., X_p]
+              | .   |
+              | A_p |
+               -   -
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias. Note that bias is not parallelized.
+        input_is_parallel: If true, we assume that the input is already
+                           split across the GPUs and we do not split
+                           again.
+        init_method: method to initialize weights. Note that bias is always set
+                     to zero.
+        stride: For the strided linear layers.
+        keep_master_weight_for_test: This was added for testing and should be
+                                     set to False. It returns the master weights
+                                     used for initialization.
+        skip_bias_add: This was added to enable performance optimization where bias
+                       can be fused with other elementwise operations. We skip
+                       adding bias but instead return it.
+    """
+    def __init__(
+        self,
+        input_size,
+        output_size,
+        bias=True,
+        input_is_parallel=False,
+        init_method=init.xavier_normal_,
+        stride=1,
+        keep_master_weight_for_test=False,
+        skip_bias_add=False,
+        *,
+        params_dtype=torch.float32,
+        use_cpu_initialization=False,
+    ):
+        super(RowParallelLinear, self).__init__()
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.input_is_parallel = input_is_parallel
+        # Divide the weight matrix along the last dimension.
+        world_size = get_tensor_model_parallel_world_size()
+        self.input_size_per_partition = divide(input_size, world_size)
+        self.skip_bias_add = skip_bias_add
+        # as an argument to this function?
+        # Parameters.
+        # Note: torch.nn.functional.linear performs XA^T + b and as a result
+        # we allocate the transpose.
+        # Initialize weight.
+        if use_cpu_initialization:
+            self.weight = Parameter(torch.empty(self.output_size, self.input_size_per_partition, dtype=params_dtype))
+            self.master_weight = _initialize_affine_weight_cpu(
+                self.weight,
+                self.output_size,
+                self.input_size,
+                self.input_size_per_partition,
+                1,
+                init_method,
+                stride=stride,
+                return_master_weight=keep_master_weight_for_test,
+                params_dtype=params_dtype,
+            )
+        else:
+            self.weight = Parameter(
+                torch.empty(
+                    self.output_size,
+                    self.input_size_per_partition,
+                    device=torch.cuda.current_device(),
+                    dtype=params_dtype,
+                )
+            )
+            _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=1, stride=stride)
+        if bias:
+            if use_cpu_initialization:
+                self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype))
+            else:
+                self.bias = Parameter(
+                    torch.empty(self.output_size, device=torch.cuda.current_device(), dtype=params_dtype)
+                )
+            # Always initialize bias to zero.
+            with torch.no_grad():
+                self.bias.zero_()
+        else:
+            self.register_parameter("bias", None)
+    def forward(self, input_):
+        # Set up backprop all-reduce.
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            input_parallel = scatter_to_tensor_model_parallel_region(input_)
+        # Matrix multiply.
+        output_parallel = F.linear(input_parallel, self.weight)
+        # All-reduce across all the partitions.
+        output_ = reduce_from_tensor_model_parallel_region(output_parallel)
+        if not self.skip_bias_add:
+            output = output_ + self.bias if self.bias is not None else output_
+            output_bias = None
+        else:
+            output = output_
+            output_bias = self.bias
+        return output, output_bias
--- a/apex/transformer/tensor_parallel/mappings.py
+++ b/apex/transformer/tensor_parallel/mappings.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from ..parallel_state import get_tensor_model_parallel_group
+from ..parallel_state import get_tensor_model_parallel_world_size
+from ..parallel_state import get_tensor_model_parallel_rank
+from .utils import split_tensor_along_last_dim
+def _reduce(input_):
+    """All-reduce the input tensor across model parallel group."""
+    # Bypass the function if we are using only 1 GPU.
+    if get_tensor_model_parallel_world_size() == 1:
+        return input_
+    # All-reduce.
+    torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
+    return input_
+def _split(input_):
+    """Split the tensor along its last dimension and keep the
+    corresponding slice."""
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    # Split along last dimension.
+    input_list = split_tensor_along_last_dim(input_, world_size)
+    # Note: torch.split does not create contiguous tensors by default.
+    rank = get_tensor_model_parallel_rank()
+    output = input_list[rank].contiguous()
+    return output
+def _gather(input_):
+    """Gather tensors and concatinate along the last dimension."""
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+    # Size and dimension.
+    last_dim = input_.dim() - 1
+    rank = get_tensor_model_parallel_rank()
+    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
+    tensor_list[rank] = input_
+    torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())
+    # Note: torch.cat already creates a contiguous tensor.
+    output = torch.cat(tensor_list, dim=last_dim).contiguous()
+    return output
+class _CopyToModelParallelRegion(torch.autograd.Function):
+    """Pass the input to the model parallel region."""
+    @staticmethod
+    def symbolic(graph, input_):
+        return input_
+    @staticmethod
+    def forward(ctx, input_):
+        return input_
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _reduce(grad_output)
+class _ReduceFromModelParallelRegion(torch.autograd.Function):
+    """All-reduce the input from the model parallel region."""
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce(input_)
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output
+class _ScatterToModelParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the corresponding chuck to the rank."""
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split(input_)
+    @staticmethod
+    def forward(ctx, input_):
+        return _split(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather(grad_output)
+class _GatherFromModelParallelRegion(torch.autograd.Function):
+    """Gather the input from model parallel region and concatinate."""
+    @staticmethod
+    def symbolic(graph, input_):
+        return _gather(input_)
+    @staticmethod
+    def forward(ctx, input_):
+        return _gather(input_)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _split(grad_output)
+# -----------------
+# Helper functions.
+# -----------------
+def copy_to_tensor_model_parallel_region(input_):
+    return _CopyToModelParallelRegion.apply(input_)
+def reduce_from_tensor_model_parallel_region(input_):
+    return _ReduceFromModelParallelRegion.apply(input_)
+def scatter_to_tensor_model_parallel_region(input_):
+    return _ScatterToModelParallelRegion.apply(input_)
+def gather_from_tensor_model_parallel_region(input_):
+    return _GatherFromModelParallelRegion.apply(input_)
--- a/apex/transformer/tensor_parallel/memory.py
+++ b/apex/transformer/tensor_parallel/memory.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+# A dictionary of all the memory buffers allocated.
+_MEM_BUFFS = dict()
+def allocate_mem_buff(name, numel, dtype, track_usage):
+    """Allocate a memory buffer."""
+    assert name not in _MEM_BUFFS, "memory buffer {} already allocated.".format(name)
+    _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage)
+    return _MEM_BUFFS[name]
+def get_mem_buff(name):
+    """Get the memory buffer."""
+    return _MEM_BUFFS[name]
+class MemoryBuffer:
+    """Contiguous memory buffer.
+    Allocate a contiguous memory of type `dtype` and size `numel`. It is
+    used to reduce memory fragmentation.
+    Usage: After the allocation, the `_start` index is set tot the first
+           index of the memory. A memory chunk starting from `_start` index
+           can be `allocated` for an input tensor, with the elements of the
+           tensor being coppied. The buffer can be reused by resetting the
+           `_start` index.
+    """
+    def __init__(self, name, numel, dtype, track_usage):
+        if torch.distributed.get_rank() == 0:
+            element_size = torch.tensor([], dtype=dtype).element_size()
+            print(
+                "> building the {} memory buffer with {} num elements "
+                "and {} dtype ({:.1f} MB)...".format(name, numel, dtype, numel * element_size / 1024 / 1024),
+                flush=True,
+            )
+        self.name = name
+        self.numel = numel
+        self.dtype = dtype
+        self.data = torch.empty(self.numel, dtype=self.dtype, device=torch.cuda.current_device(), requires_grad=False)
+        # Index tracking the start of the free memory.
+        self._start = 0
+        # Values used for tracking usage.
+        self.track_usage = track_usage
+        if self.track_usage:
+            self.in_use_value = 0.0
+            self.total_value = 0.0
+    def reset(self):
+        """Reset the buffer start index to the beginning of the buffer."""
+        self._start = 0
+    def is_in_use(self):
+        """Whether the current buffer hold on to any memory."""
+        return self._start > 0
+    def numel_in_use(self):
+        """Return number of elements in use."""
+        return self._start
+    def add(self, tensor):
+        """Allocate a chunk of memory from the buffer to tensor and copy
+        the values."""
+        assert tensor.dtype == self.dtype, "Input tensor type {} different from buffer type {}".format(
+            tensor.dtype, self.dtype
+        )
+        # Number of elements of the input tensor.
+        tensor_numel = torch.numel(tensor)
+        new_start = self._start + tensor_numel
+        assert new_start <= self.numel, "Not enough memory left in the buffer ({} > {})".format(
+            tensor_numel, self.numel - self._start
+        )
+        # New tensor is a view into the memory.
+        new_tensor = self.data[self._start : new_start]
+        self._start = new_start
+        new_tensor = new_tensor.view(tensor.shape)
+        new_tensor.copy_(tensor)
+        # Return a pointer to the new tensor.
+        return new_tensor
+    def get_data(self):
+        """Return the data currently in use."""
+        if self.track_usage:
+            self.in_use_value += float(self._start)
+            self.total_value += float(self.numel)
+        return self.data[: self._start]
+    def print_average_usage(self):
+        """Print memory usage average over time. We would like this value
+        to be as high as possible."""
+        assert self.track_usage, "You need to enable track usage."
+        if torch.distributed.get_rank() == 0:
+            print(
+                " > usage of {} memory buffer: {:.2f} %".format(
+                    self.name, self.in_use_value * 100.0 / self.total_value
+                ),
+                flush=True,
+            )
+class RingMemBuffer:
+    """A ring of memory buffers."""
+    def __init__(self, name, num_buffers, numel, dtype, track_usage):
+        self.num_buffers = num_buffers
+        self.buffers = [
+            allocate_mem_buff(name + " {}".format(i), numel, dtype, track_usage) for i in range(num_buffers)
+        ]
+        self._index = -1
+    def get_next_buffer(self):
+        self._index += 1
+        self._index = self._index % self.num_buffers
+        buff = self.buffers[self._index]
+        assert not buff.is_in_use(), "buffer is already in use."
+        return buff
--- a/apex/transformer/tensor_parallel/microbatches.py
+++ b/apex/transformer/tensor_parallel/microbatches.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Megatron number of micro-batches calculators."""
+from abc import ABC
+from abc import abstractmethod
+def build_num_microbatches_calculator(args):
+    # Constant num micro-batches.
+    if args.rampup_batch_size is None:
+        num_microbatches_calculator = ConstantNumMicroBatches(
+            args.global_batch_size, args.micro_batch_size, args.data_parallel_size
+        )
+        if args.rank == 0:
+            print(
+                "setting number of micro-batches to constant {}".format(num_microbatches_calculator.get()), flush=True
+            )
+    else:
+        assert len(args.rampup_batch_size) == 3, (
+            "expected the following "
+            "format: --rampup-batch-size <start batch size> "
+            "<batch size incerement> <ramp-up samples>"
+        )
+        start_batch_size = int(args.rampup_batch_size[0])
+        batch_size_increment = int(args.rampup_batch_size[1])
+        ramup_samples = int(args.rampup_batch_size[2])
+        if args.rank == 0:
+            print(
+                "will use batch size rampup starting from global batch "
+                "size {} to global batch size {} with batch size increments "
+                "{} over {} samples.".format(
+                    start_batch_size, args.global_batch_size, batch_size_increment, ramup_samples
+                ),
+                flush=True,
+            )
+        num_microbatches_calculator = RampupBatchsizeNumMicroBatches(
+            start_batch_size,
+            batch_size_increment,
+            ramup_samples,
+            args.global_batch_size,
+            args.micro_batch_size,
+            args.data_parallel_size,
+        )
+    return num_microbatches_calculator
+class NumMicroBatchesCalculator(ABC):
+    def __init__(self):
+        self.num_micro_batches = None
+        self.current_global_batch_size = None
+    def get(self):
+        return self.num_micro_batches
+    def get_current_global_batch_size(self):
+        return self.current_global_batch_size
+    @abstractmethod
+    def update(self, consumed_samples, consistency_check):
+        pass
+class ConstantNumMicroBatches(NumMicroBatchesCalculator):
+    def __init__(self, global_batch_size, micro_batch_size, data_parallel_size):
+        micro_batch_times_data_parallel = micro_batch_size * data_parallel_size
+        assert global_batch_size % micro_batch_times_data_parallel == 0, (
+            "global batch size ({}) is not divisible by micro batch size ({})"
+            " times data parallel size ({})".format(global_batch_size, micro_batch_size, data_parallel_size)
+        )
+        self.num_micro_batches = global_batch_size // micro_batch_times_data_parallel
+        assert self.num_micro_batches >= 1
+        self.current_global_batch_size = global_batch_size
+    def update(self, consumed_samples, consistency_check):
+        pass
+class RampupBatchsizeNumMicroBatches(NumMicroBatchesCalculator):
+    def __init__(
+        self,
+        start_batch_size,
+        batch_size_increment,
+        ramup_samples,
+        global_batch_size,
+        micro_batch_size,
+        data_parallel_size,
+    ):
+        """Batch size ramp up.
+        Over
+          steps = (global-batch-size - start-batch-size) / batch_size_increment
+        increment batch size from start-batch-size to global-batch-size using
+          rampup-samples / steps
+        samples.
+        Arguments:
+            start_batch_size: global batch size to start with
+            batch_size_increment: global batch size increments
+            ramup_samples: number of samples to use ramp up global
+               batch size from `start_batch_size` to `global_batch_size`
+            global_batch_size: global batch size post rampup
+            micro_batch_size: micro batch size
+            data_parallel_size: data parallel size.
+        """
+        self.micro_batch_size = micro_batch_size
+        self.data_parallel_size = data_parallel_size
+        self.micro_batch_times_data_parallel_size = self.micro_batch_size * self.data_parallel_size
+        assert self.micro_batch_times_data_parallel_size > 0
+        assert start_batch_size > 0
+        self.start_batch_size = start_batch_size
+        assert global_batch_size > 0
+        self.global_batch_size = global_batch_size
+        diff_batch_size = self.global_batch_size - self.start_batch_size
+        assert diff_batch_size >= 0
+        assert batch_size_increment > 0
+        self.batch_size_increment = batch_size_increment
+        assert diff_batch_size % batch_size_increment == 0, (
+            "expected "
+            "global batch size interval ({}) to be divisible by global batch "
+            "size increment ({})".format(diff_batch_size, batch_size_increment)
+        )
+        num_increments = diff_batch_size // self.batch_size_increment
+        self.ramup_samples = ramup_samples
+        assert self.ramup_samples >= 0
+        self.rampup_samples_per_increment = self.ramup_samples / num_increments
+        # Initialize number of microbatches.
+        self.update(0, False)
+    def update(self, consumed_samples, consistency_check):
+        if consumed_samples > self.ramup_samples:
+            self.current_global_batch_size = self.global_batch_size
+        else:
+            steps = int(consumed_samples / self.rampup_samples_per_increment)
+            self.current_global_batch_size = self.start_batch_size + steps * self.batch_size_increment
+            assert self.current_global_batch_size <= self.global_batch_size
+        if consistency_check:
+            assert self.current_global_batch_size % self.micro_batch_times_data_parallel_size == 0, (
+                "current global "
+                "batch size ({}) is not divisible by micro-batch-size ({}) times"
+                "data parallel size ({})".format(
+                    self.current_global_batch_size, self.micro_batch_size, self.data_parallel_size
+                )
+            )
+        self.num_micro_batches = self.current_global_batch_size // self.micro_batch_times_data_parallel_size
--- a/apex/transformer/tensor_parallel/random.py
+++ b/apex/transformer/tensor_parallel/random.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+import contextlib
+import torch
+from torch import _C
+from torch.cuda import _lazy_call, device as device_ctx_manager
+from torch.utils.checkpoint import detach_variable
+from ..parallel_state import get_data_parallel_rank
+from ..parallel_state import get_tensor_model_parallel_group
+from ..parallel_state import get_tensor_model_parallel_rank
+from ..parallel_state import get_tensor_model_parallel_world_size
+from .memory import allocate_mem_buff
+# Default name for the model parallel rng tracker.
+_MODEL_PARALLEL_RNG_TRACKER_NAME = "model-parallel-rng"
+# Whether apply model parallelsim to checkpointed hidden states.
+_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None
+# TODO (mkozuki): Consider the possibility of removing `tensor_model_parallel_size`,
+# `get_tensor_model_parallel_world_size()` might be alternative.
+def init_checkpointed_activations_memory_buffer(
+    micro_batch_size,
+    max_position_embeddings,
+    hidden_size,
+    num_layers,
+    tensor_model_parallel_size,
+    checkpoint_num_layers,
+    fp16,
+):
+    """Initializ the memory buffer for the checkpointed activations."""
+    per_layer = micro_batch_size * max_position_embeddings * hidden_size // tensor_model_parallel_size
+    assert num_layers % checkpoint_num_layers == 0, "number of layers is not divisible by checkpoint-num-layers"
+    num_checkpointer_layers = num_layers // checkpoint_num_layers
+    numel = per_layer * num_checkpointer_layers
+    dtype = torch.half
+    if not fp16:
+        dtype = torch.float
+    global _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER
+    assert (
+        _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is None
+    ), "checkpointed activations memory buffer is already allocated."
+    _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = allocate_mem_buff(
+        "checkpointed activations", numel, dtype, track_usage=False
+    )
+def reset_checkpointed_activations_memory_buffer():
+    """Reset the memory used for checkpointing."""
+    if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
+        _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.reset()
+def _set_cuda_rng_state(new_state, device=-1):
+    """Sets the random number generator state of the current GPU.
+    Argumentss:
+        new_state (torch.ByteTensor): The desired state
+    This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
+    with a single change: the input state is not cloned. Cloning caused
+    major performance issues for +4 GPU cases.
+    """
+    if hasattr(_C, "_cuda_setRNGState") and callable(_C._cuda_setRNGState):
+        # older PyTorch
+        def cb():
+            with device_ctx_manager(device):
+                _C._cuda_setRNGState(new_state)
+    else:
+        # newer PyTorch
+        if device == -1:
+            device = torch.device("cuda")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        elif isinstance(device, int):
+            device = torch.device("cuda", device)
+        def cb():
+            idx = device.index
+            if idx is None:
+                idx = torch.cuda.current_device()
+            default_generator = torch.cuda.default_generators[idx]
+            default_generator.set_state(new_state)
+    _lazy_call(cb)
+def split_tensor_into_1d_equal_chunks(tensor):
+    """Break a tensor into equal 1D chunks."""
+    data = tensor.view(-1)
+    partition_size = torch.numel(data) // get_tensor_model_parallel_world_size()
+    start_index = partition_size * get_tensor_model_parallel_rank()
+    end_index = start_index + partition_size
+    return data[start_index:end_index]
+def gather_split_1d_tensor(tensor):
+    """Opposite of above function, gather values from model parallel ranks."""
+    world_size = get_tensor_model_parallel_world_size()
+    numel = torch.numel(tensor)
+    numel_gathered = world_size * numel
+    gathered = torch.empty(numel_gathered, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False)
+    chunks = [gathered[i * numel : (i + 1) * numel] for i in range(world_size)]
+    torch.distributed.all_gather(chunks, tensor, group=get_tensor_model_parallel_group())
+    return gathered
+class CudaRNGStatesTracker:
+    """Tracker for the cuda RNG states.
+    Using the `add` method, a cuda rng state is initialized based on
+    the input `seed` and is assigned to `name`. Later, by forking the
+    rng state, we can perform operations and return to our starting
+    cuda state.
+    """
+    def __init__(self):
+        # Map from a string name to the cuda rng state.
+        self.states_ = {}
+        # Seeds are just for book keeping and ensure no seed is set twice.
+        self.seeds_ = set()
+    def reset(self):
+        """Set to the initial state (no tracker)."""
+        self.states_ = {}
+        self.seeds_ = set()
+    def get_states(self):
+        """Get rng states. Copy the dictionary so we have direct
+        pointers to the states, not just a pointer to the dictionary."""
+        states = {}
+        for name in self.states_:
+            states[name] = self.states_[name]
+        return states
+    def set_states(self, states):
+        """Set the rng states. For efficiency purposes, we do not check
+        the size of seed for compatibility."""
+        self.states_ = states
+    def add(self, name, seed):
+        """Track the rng state."""
+        # Check seed is not already used.
+        if seed in self.seeds_:
+            raise Exception("seed {} already exists".format(seed))
+        self.seeds_.add(seed)
+        # Check that state is not already defined.
+        if name in self.states_:
+            raise Exception("cuda rng state {} already exists".format(name))
+        # Get the current rng state.
+        orig_rng_state = torch.cuda.get_rng_state()
+        # Set the new state and store it.
+        torch.cuda.manual_seed(seed)
+        self.states_[name] = torch.cuda.get_rng_state()
+        # Reset rng state to what it was.
+        _set_cuda_rng_state(orig_rng_state)
+    @contextlib.contextmanager
+    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
+        """Fork the cuda rng state, perform operations, and exit with
+        the original state."""
+        # Check if we have added the state
+        if name not in self.states_:
+            raise Exception("cuda rng state {} is not added".format(name))
+        # Store current rng state.
+        orig_cuda_rng_state = torch.cuda.get_rng_state()
+        # Set rng state to the desired one
+        _set_cuda_rng_state(self.states_[name])
+        # Do the stuff we wanted to do.
+        try:
+            yield
+        finally:
+            # Update the current rng state for later use.
+            self.states_[name] = torch.cuda.get_rng_state()
+            # And set the state to the original state we started with.
+            _set_cuda_rng_state(orig_cuda_rng_state)
+# RNG tracker object.
+_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
+def get_cuda_rng_tracker():
+    """Get cuda rng tracker."""
+    return _CUDA_RNG_STATE_TRACKER
+def model_parallel_cuda_manual_seed(seed):
+    """Initialize model parallel cuda seed.
+    This function should be called after the model parallel is
+    initialized. Also, no torch.cuda.manual_seed should be called
+    after this function. Basically, this is replacement for that
+    function.
+    Two set of RNG states are tracked:
+        default state: This is for data parallelism and is the same among a
+                       set of model parallel GPUs but different across
+                       different model paralle groups. This is used for
+                       example for dropout in the non-tensor-model-parallel regions.
+        tensor-model-parallel state: This state is different among a set of model
+                              parallel GPUs, but the same across data parallel
+                              groups. This is used for example for dropout in
+                              model parallel regions.
+    """
+    # 2718 is just for fun and any POSITIVE value will work.
+    offset = seed + 2718
+    tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
+    # Data parallel gets the original seed.
+    data_parallel_seed = seed
+    _CUDA_RNG_STATE_TRACKER.reset()
+    # Set the default state.
+    torch.cuda.manual_seed(data_parallel_seed)
+    # and model parallel state.
+    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tensor_model_parallel_seed)
+class CheckpointFunction(torch.autograd.Function):
+    """This function is adapted from torch.utils.checkpoint with
+       two main changes:
+           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
+           2) the states in the model parallel tracker are also properly
+              tracked/set/reset.
+    """
+    @staticmethod
+    def forward(ctx, run_function, *args):
+        ctx.run_function = run_function
+        # Copy the rng states.
+        ctx.fwd_cpu_rng_state = torch.get_rng_state()
+        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
+        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+        with torch.no_grad():
+            outputs = run_function(*args)
+        # Divide hidden states across model parallel group and only keep
+        # the chunk corresponding to the current rank.
+        if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
+            ctx.input_0_shape = args[0].data.shape
+            args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
+            args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(args[0].data)
+        # Store everything.
+        ctx.save_for_backward(*args)
+        return outputs
+    @staticmethod
+    def backward(ctx, *args):
+        if not torch.autograd._is_checkpoint_valid():
+            raise RuntimeError("Checkpointing is not compatible with .grad(), " "please use .backward() if possible")
+        inputs = ctx.saved_tensors
+        if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
+            inputs[0].data = gather_split_1d_tensor(inputs[0].data)
+            inputs[0].data = inputs[0].data.view(ctx.input_0_shape)
+        # Store the current states.
+        bwd_cpu_rng_state = torch.get_rng_state()
+        bwd_cuda_rng_state = torch.cuda.get_rng_state()
+        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+        # Set the states to what it used to be before the forward pass.
+        torch.set_rng_state(ctx.fwd_cpu_rng_state)
+        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
+        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
+        # Compute the forward pass.
+        detached_inputs = detach_variable(inputs)
+        with torch.enable_grad():
+            outputs = ctx.run_function(*detached_inputs)
+        # Set the states back to what it was at the start of this function.
+        torch.set_rng_state(bwd_cpu_rng_state)
+        _set_cuda_rng_state(bwd_cuda_rng_state)
+        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
+        if isinstance(outputs, torch.Tensor):
+            outputs = (outputs,)
+        torch.autograd.backward(outputs, args)
+        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in detached_inputs)
+        return (None,) + grads
+def checkpoint(function, *args):
+    """Checkpoint a model or part of the model.
+    This has been directly copied from torch.utils.checkpoint."""
+    return CheckpointFunction.apply(function, *args)
--- a/apex/transformer/tensor_parallel/tests/__init__.py
+++ b/apex/transformer/tensor_parallel/tests/__init__.py