transformer utils (#1181)

Co-authored-by: Piotr Bialecki <pbialecki@nvidia.com> Co-authored-by: Eddie Yan <eddiey@nvidia.com> Co-authored-by: Rishi Puri <riship@nvidia.com> Co-authored-by: Sangkug Lym <slym@nvidia.com>

transformer utils (#1181)
Co-authored-by: Piotr Bialecki <pbialecki@nvidia.com> Co-authored-by: Eddie Yan <eddiey@nvidia.com> Co-authored-by: Rishi Puri <riship@nvidia.com> Co-authored-by: Sangkug Lym <slym@nvidia.com>
365fdc18 · Masaki Kozuki · GitHub · bdac244e · 365fdc18 · 365fdc18
Unverified Commit 365fdc18 authored Oct 02, 2021 by Masaki Kozuki Committed by GitHub Oct 02, 2021
20 changed files
--- a/apex/transformer/tensor_parallel/tests/arguments.py
+++ b/apex/transformer/tensor_parallel/tests/arguments.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Megatron arguments."""
+import argparse
+import os
+
+import torch
+
+
+def parse_args(extra_args_provider=None, defaults={},
+               ignore_unknown_args=False):
+    """Parse all arguments."""
+    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
+                                     allow_abbrev=False)
+
+    # Standard arguments.
+    parser = _add_network_size_args(parser)
+    parser = _add_regularization_args(parser)
+    parser = _add_training_args(parser)
+    parser = _add_initialization_args(parser)
+    parser = _add_learning_rate_args(parser)
+    parser = _add_checkpointing_args(parser)
+    parser = _add_mixed_precision_args(parser)
+    parser = _add_distributed_args(parser)
+    parser = _add_validation_args(parser)
+    parser = _add_data_args(parser)
+    parser = _add_autoresume_args(parser)
+    parser = _add_biencoder_args(parser)
+    parser = _add_vit_args(parser)
+    parser = _add_logging_args(parser)
+
+    # Custom arguments.
+    if extra_args_provider is not None:
+        parser = extra_args_provider(parser)
+
+    # Parse.
+    if ignore_unknown_args:
+        args, _ = parser.parse_known_args()
+    else:
+        args = parser.parse_args()
+
+    # Distributed args.
+    args.rank = int(os.getenv('RANK', '0'))
+    args.world_size = int(os.getenv("WORLD_SIZE", '1'))
+    # Tensor model parallel size.
+    args.tensor_model_parallel_size = min(
+        args.tensor_model_parallel_size, args.world_size)
+    assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\
+        ' ({}) is not divisible by tensor model parallel size ({})'.format(
+            args.world_size, args.tensor_model_parallel_size)
+    # Pipeline model parallel size.
+    args.pipeline_model_parallel_size = min(
+        args.pipeline_model_parallel_size,
+        (args.world_size // args.tensor_model_parallel_size))
+    # Checks.
+    model_parallel_size = args.pipeline_model_parallel_size * \
+                          args.tensor_model_parallel_size
+    assert args.world_size % model_parallel_size == 0, 'world size is not'\
+        ' divisible by tensor parallel size ({}) times pipeline parallel ' \
+        'size ({})'.format(args.world_size, args.tensor_model_parallel_size,
+                           args.pipeline_model_parallel_size)
+    args.data_parallel_size = args.world_size // model_parallel_size
+    if args.rank == 0:
+        print('using world size: {}, data-parallel-size: {}, '
+              'tensor-model-parallel size: {}, '
+              'pipeline-model-parallel size: {} '.format(
+                  args.world_size, args.data_parallel_size,
+                  args.tensor_model_parallel_size,
+                  args.pipeline_model_parallel_size), flush=True)
+
+    # Deprecated arguments
+    assert args.batch_size is None, '--batch-size argument is no longer ' \
+        'valid, use --micro-batch-size instead'
+    del args.batch_size
+    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
+        '--lr-warmup-fraction instead'
+    del args.warmup
+    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
+        'longer valid, use --tensor-model-parallel-size instead'
+    del args.model_parallel_size
+
+    # Set input defaults.
+    for key in defaults:
+        # For default to be valid, it should not be provided in the
+        # arguments that are passed to the program. We check this by
+        # ensuring the arg is set to None.
+        if getattr(args, key) is not None:
+            if args.rank == 0:
+                print('WARNING: overriding default arguments for {key}:{v} \
+                       with {key}:{v2}'.format(key=key, v=defaults[key],
+                                               v2=getattr(args, key)),
+                                               flush=True)
+        else:
+            setattr(args, key, defaults[key])
+
+    # Batch size.
+    assert args.micro_batch_size is not None
+    assert args.micro_batch_size > 0
+    if args.global_batch_size is None:
+        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
+        if args.rank == 0:
+            print('setting global batch size to {}'.format(
+                args.global_batch_size), flush=True)
+    assert args.global_batch_size > 0
+    if args.num_layers_per_virtual_pipeline_stage is not None:
+        assert args.pipeline_model_parallel_size > 2, \
+            'pipeline-model-parallel size should be greater than 2 with ' \
+            'interleaved schedule'
+        assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \
+            'number of layers is not divisible by number of layers per virtual ' \
+            'pipeline stage'
+        args.virtual_pipeline_model_parallel_size = \
+            (args.num_layers // args.pipeline_model_parallel_size) // \
+            args.num_layers_per_virtual_pipeline_stage
+    else:
+        args.virtual_pipeline_model_parallel_size = None
+
+    # Parameters dtype.
+    args.params_dtype = torch.float
+    if args.fp16:
+        assert not args.bf16
+        args.params_dtype = torch.half
+    if args.bf16:
+        assert not args.fp16
+        args.params_dtype = torch.bfloat16
+        # bfloat16 requires gradient accumulation and all-reduce to
+        # be done in fp32.
+        if not args.accumulate_allreduce_grads_in_fp32:
+            args.accumulate_allreduce_grads_in_fp32 = True
+            if args.rank == 0:
+                print('accumulate and all-reduce gradients in fp32 for '
+                      'bfloat16 data type.', flush=True)
+
+    if args.rank == 0:
+        print('using {} for parameters ...'.format(args.params_dtype),
+              flush=True)
+
+    # If we do accumulation and all-reduces in fp32, we need to have
+    # local DDP and we should set the use-contiguous-buffers-in-ddp.
+    if args.accumulate_allreduce_grads_in_fp32:
+        assert args.DDP_impl == 'local'
+        args.use_contiguous_buffers_in_ddp = True
+
+    # If we use a contiguous buffer to hold main grads, we need to have
+    # local DDP.
+    if args.use_contiguous_buffers_in_ddp:
+        assert args.DDP_impl == 'local'
+
+    if args.dataloader_type is None:
+        args.dataloader_type = 'single'
+
+    # Consumed tokens.
+    args.consumed_train_samples = 0
+    args.consumed_valid_samples = 0
+
+    # Iteration-based training.
+    if args.train_iters:
+        # If we use iteration-based training, make sure the
+        # sample-based options are off.
+        assert args.train_samples is None, \
+            'expected iteration-based training'
+        assert args.lr_decay_samples is None, \
+            'expected iteration-based learning rate decay'
+        assert args.lr_warmup_samples == 0, \
+            'expected iteration-based learning rate warmup'
+        assert args.rampup_batch_size is None, \
+            'expected no batch-size rampup for iteration-based training'
+        if args.lr_warmup_fraction is not None:
+            assert args.lr_warmup_iters == 0, \
+                'can only specify one of lr-warmup-fraction and lr-warmup-iters'
+
+    # Sample-based training.
+    if args.train_samples:
+        # If we use sample-based training, make sure the
+        # iteration-based options are off.
+        assert args.train_iters is None, \
+            'expected sample-based training'
+        assert args.lr_decay_iters is None, \
+            'expected sample-based learning rate decay'
+        assert args.lr_warmup_iters == 0, \
+            'expected sample-based learnig rate warmup'
+        if args.lr_warmup_fraction is not None:
+            assert args.lr_warmup_samples == 0, \
+                'can only specify one of lr-warmup-fraction ' \
+                'and lr-warmup-samples'
+
+    # Check required arguments.
+    required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
+                     'max_position_embeddings']
+    for req_arg in required_args:
+        _check_arg_is_not_none(args, req_arg)
+
+    # Checks.
+    if args.ffn_hidden_size is None:
+        args.ffn_hidden_size = 4 * args.hidden_size
+
+    if args.kv_channels is None:
+        assert args.hidden_size % args.num_attention_heads == 0
+        args.kv_channels = args.hidden_size // args.num_attention_heads
+
+    if args.seq_length is not None:
+        assert args.encoder_seq_length is None
+        args.encoder_seq_length = args.seq_length
+    else:
+        assert args.encoder_seq_length is not None
+        args.seq_length = args.encoder_seq_length
+
+    if args.seq_length is not None:
+        assert args.max_position_embeddings >= args.seq_length
+    if args.decoder_seq_length is not None:
+        assert args.max_position_embeddings >= args.decoder_seq_length
+    if args.lr is not None:
+        assert args.min_lr <= args.lr
+    if args.save is not None:
+        assert args.save_interval is not None
+    # Mixed precision checks.
+    if args.fp16_lm_cross_entropy:
+        assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
+    if args.fp32_residual_connection:
+        assert args.fp16 or args.bf16, \
+            'residual connection in fp32 only supported when using fp16 or bf16.'
+    # Activation checkpointing.
+    if args.distribute_checkpointed_activations:
+        assert args.checkpoint_activations, \
+            'for distribute-checkpointed-activations to work you '\
+            'need to enable checkpoint-activations'
+
+    _print_args(args)
+    return args
+
+
+def _print_args(args):
+    """Print arguments."""
+    if args.rank == 0:
+        print('------------------------ arguments ------------------------',
+              flush=True)
+        str_list = []
+        for arg in vars(args):
+            dots = '.' * (48 - len(arg))
+            str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
+        for arg in sorted(str_list, key=lambda x: x.lower()):
+            print(arg, flush=True)
+        print('-------------------- end of arguments ---------------------',
+              flush=True)
+
+
+def _check_arg_is_not_none(args, arg):
+    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
+
+
+def _add_network_size_args(parser):
+    group = parser.add_argument_group(title='network size')
+
+    group.add_argument('--num-layers', type=int, default=None,
+                       help='Number of transformer layers.')
+    group.add_argument('--hidden-size', type=int, default=None,
+                       help='Tansformer hidden size.')
+    group.add_argument('--ffn-hidden-size', type=int, default=None,
+                       help='Transformer Feed-Forward Network hidden size. '
+                       'This is set to 4*hidden-size if not provided')
+    group.add_argument('--num-attention-heads', type=int, default=None,
+                       help='Number of transformer attention heads.')
+    group.add_argument('--kv-channels', type=int, default=None,
+                       help='Projection weights dimension in multi-head '
+                       'attention. This is set to '
+                       '   args.hidden_size // args.num_attention_heads '
+                       'if not provided.')
+    group.add_argument('--max-position-embeddings', type=int, default=None,
+                       help='Maximum number of position embeddings to use. '
+                       'This is the size of position embedding.')
+    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
+                       help='Pad the vocab size to be divisible by this value.'
+                       'This is added for computational efficieny reasons.')
+    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
+                       help='Layer norm epsilon.')
+    group.add_argument('--apply-residual-connection-post-layernorm',
+                       action='store_true',
+                       help='If set, use original BERT residula connection '
+                       'ordering.')
+    group.add_argument('--openai-gelu', action='store_true',
+                       help='Use OpenAIs GeLU implementation. This option'
+                       'should not be used unless for backward compatibility'
+                       'reasons.')
+    group.add_argument('--onnx-safe', type=bool, required=False,
+                       help='Use workarounds for known problems with '
+                       'Torch ONNX exporter')
+    group.add_argument('--bert-no-binary-head', action='store_false',
+                       help='Disable BERT binary head.',
+                       dest='bert_binary_head')
+
+    return parser
+
+
+def _add_logging_args(parser):
+    group = parser.add_argument_group(title='logging')
+
+    group.add_argument('--log-params-norm', action='store_true',
+                       help='If set, calculate and log parameters norm.')
+    group.add_argument('--log-num-zeros-in-grad', action='store_true',
+                       help='If set, calculate and log the number of zeros in gradient.')
+    group.add_argument('--tensorboard-log-interval', type=int, default=1,
+                       help='Report to tensorboard interval.')
+    group.add_argument('--tensorboard-queue-size', type=int, default=1000,
+                       help='Size of the tensorboard queue for pending events '
+                       'and summaries before one of the ‘add’ calls forces a '
+                       'flush to disk.')
+    group.add_argument('--log-timers-to-tensorboard', action='store_true',
+                       help='If set, write timers to tensorboard.')
+    group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
+                       help='If set, write batch-size to tensorboard.')
+    group.add_argument('--no-log-learnig-rate-to-tensorboard',
+                       action='store_false',
+                       help='Disable learning rate logging to tensorboard.',
+                       dest='log_learning_rate_to_tensorboard')
+    group.add_argument('--no-log-loss-scale-to-tensorboard',
+                       action='store_false',
+                       help='Disable loss-scale logging to tensorboard.',
+                       dest='log_loss_scale_to_tensorboard')
+    group.add_argument('--log-validation-ppl-to-tensorboard',
+                       action='store_true',
+                       help='If set, write validation perplexity to '
+                       'tensorboard.')
+    group.add_argument('--log-memory-to-tensorboard',
+                       action='store_true',
+                       help='Enable memory logging to tensorboard.')
+
+    return parser
+
+
+def _add_regularization_args(parser):
+    group = parser.add_argument_group(title='regularization')
+
+    group.add_argument('--attention-dropout', type=float, default=0.1,
+                       help='Post attention dropout probability.')
+    group.add_argument('--hidden-dropout', type=float, default=0.1,
+                       help='Dropout probability for hidden state transformer.')
+    group.add_argument('--weight-decay', type=float, default=0.01,
+                       help='Weight decay coefficient for L2 regularization.')
+    group.add_argument('--clip-grad', type=float, default=1.0,
+                       help='Gradient clipping based on global L2 norm.')
+    group.add_argument('--adam-beta1', type=float, default=0.9,
+                       help='First coefficient for computing running averages '
+                       'of gradient and its square')
+    group.add_argument('--adam-beta2', type=float, default=0.999,
+                       help='Second coefficient for computing running averages '
+                       'of gradient and its square')
+    group.add_argument('--adam-eps', type=float, default=1e-08,
+                       help='Term added to the denominator to improve'
+                       'numerical stability')
+    group.add_argument('--sgd-momentum', type=float, default=0.9,
+                       help='Momentum factor for sgd')
+
+    return parser
+
+
+def _add_training_args(parser):
+    group = parser.add_argument_group(title='training')
+
+    group.add_argument('--micro-batch-size', type=int, default=None,
+                       help='Batch size per model instance (local batch size). '
+                       'Global batch size is local batch size times data '
+                       'parallel size times number of micro batches.')
+    group.add_argument('--batch-size', type=int, default=None,
+                       help='Old batch size parameter, do not use. '
+                       'Use --micro-batch-size instead')
+    group.add_argument('--global-batch-size', type=int, default=None,
+                       help='Training batch size. If set, it should be a '
+                       'multiple of micro-batch-size times data-parallel-size. '
+                       'If this value is None, then '
+                       'use micro-batch-size * data-parallel-size as the '
+                       'global batch size. This choice will result in 1 for '
+                       'number of micro-batches.')
+    group.add_argument('--rampup-batch-size', nargs='*', default=None,
+                       help='Batch size ramp up with the following values:'
+                       '  --rampup-batch-size <start batch size> '
+                       '                      <batch size incerement> '
+                       '                      <ramp-up samples> '
+                       'For example:'
+                       '   --rampup-batch-size 16 8 300000 \ '
+                       '   --global-batch-size 1024'
+                       'will start with global batch size 16 and over '
+                       ' (1024 - 16) / 8 = 126 intervals will increase'
+                       'the batch size linearly to 1024. In each interval'
+                       'we will use approximately 300000 / 126 = 2380 samples.')
+    group.add_argument('--checkpoint-activations', action='store_true',
+                       help='Checkpoint activation to allow for training '
+                       'with larger models, sequences, and batch sizes.')
+    group.add_argument('--distribute-checkpointed-activations',
+                       action='store_true',
+                       help='If set, distribute checkpointed activations '
+                       'across model parallel group.')
+    group.add_argument('--checkpoint-num-layers', type=int, default=1,
+                       help='chunk size (number of layers) for checkpointing.')
+    group.add_argument('--train-iters', type=int, default=None,
+                       help='Total number of iterations to train over all '
+                       'training runs. Note that either train-iters or '
+                       'train-samples should be provided.')
+    group.add_argument('--train-samples', type=int, default=None,
+                       help='Total number of samples to train over all '
+                       'training runs. Note that either train-iters or '
+                       'train-samples should be provided.')
+    group.add_argument('--log-interval', type=int, default=100,
+                       help='Report loss and timing interval.')
+    group.add_argument('--exit-interval', type=int, default=None,
+                       help='Exit the program after the iteration is divisible '
+                       'by this value.')
+    group.add_argument('--exit-duration-in-mins', type=int, default=None,
+                       help='Exit the program after this many minutes.')
+    group.add_argument('--tensorboard-dir', type=str, default=None,
+                       help='Write TensorBoard logs to this directory.')
+    group.add_argument('--no-masked-softmax-fusion',
+                       action='store_false',
+                       help='Disable fusion of query_key_value scaling, '
+                       'masking, and softmax.',
+                       dest='masked_softmax_fusion')
+    group.add_argument('--no-bias-gelu-fusion', action='store_false',
+                       help='Disable bias and gelu fusion.',
+                       dest='bias_gelu_fusion')
+    group.add_argument('--no-bias-dropout-fusion', action='store_false',
+                       help='Disable bias and dropout fusion.',
+                       dest='bias_dropout_fusion')
+    group.add_argument('--optimizer', type=str, default='adam',
+                       choices=['adam', 'sgd'],
+                       help='Optimizer function')
+    group.add_argument('--dataloader-type', type=str, default=None,
+                       choices=['single', 'cyclic'],
+                       help='Single pass vs multiple pass data loader')
+    return parser
+
+
+def _add_initialization_args(parser):
+    group = parser.add_argument_group(title='initialization')
+
+    group.add_argument('--seed', type=int, default=1234,
+                       help='Random seed used for python, numpy, '
+                       'pytorch, and cuda.')
+    group.add_argument('--init-method-std', type=float, default=0.02,
+                       help='Standard deviation of the zero mean normal '
+                       'distribution used for weight initialization.')
+    group.add_argument('--init-method-xavier-uniform', action='store_true',
+                       help='Enable Xavier uniform parameter initialization')
+
+    return parser
+
+
+def _add_learning_rate_args(parser):
+    group = parser.add_argument_group(title='learning rate')
+
+    group.add_argument('--lr', type=float, default=None,
+                       help='Initial learning rate. Depending on decay style '
+                       'and initial warmup, the learing rate at each '
+                       'iteration would be different.')
+    group.add_argument('--lr-decay-style', type=str, default='linear',
+                       choices=['constant', 'linear', 'cosine'],
+                       help='Learning rate decay function.')
+    group.add_argument('--lr-decay-iters', type=int, default=None,
+                       help='number of iterations to decay learning rate over,'
+                       ' If None defaults to `--train-iters`')
+    group.add_argument('--lr-decay-samples', type=int, default=None,
+                       help='number of samples to decay learning rate over,'
+                       ' If None defaults to `--train-samples`')
+    group.add_argument('--lr-warmup-fraction', type=float, default=None,
+                       help='fraction of lr-warmup-(iters/samples) to use '
+                       'for warmup (as a float)')
+    group.add_argument('--lr-warmup-iters', type=int, default=0,
+                       help='number of iterations to linearly warmup '
+                       'learning rate over.')
+    group.add_argument('--lr-warmup-samples', type=int, default=0,
+                       help='number of samples to linearly warmup '
+                       'learning rate over.')
+    group.add_argument('--warmup', type=int, default=None,
+                       help='Old lr warmup argument, do not use. Use one of the'
+                       '--lr-warmup-* arguments above')
+    group.add_argument('--min-lr', type=float, default=0.0,
+                       help='Minumum value for learning rate. The scheduler'
+                       'clip values below this threshold.')
+    group.add_argument('--override-lr-scheduler', action='store_true',
+                       help='Reset the values of the scheduler (learning rate,'
+                       'warmup iterations, minimum learning rate, maximum '
+                       'number of iterations, and decay style from input '
+                       'arguments and ignore values from checkpoints. Note'
+                       'that all the above values will be reset.')
+    group.add_argument('--use-checkpoint-lr-scheduler', action='store_true',
+                       help='Use checkpoint to set the values of the scheduler '
+                       '(learning rate, warmup iterations, minimum learning '
+                       'rate, maximum number of iterations, and decay style '
+                       'from checkpoint and ignore input arguments.')
+
+    return parser
+
+
+def _add_checkpointing_args(parser):
+    group = parser.add_argument_group(title='checkpointing')
+
+    group.add_argument('--save', type=str, default=None,
+                       help='Output directory to save checkpoints to.')
+    group.add_argument('--save-interval', type=int, default=None,
+                       help='Number of iterations between checkpoint saves.')
+    group.add_argument('--no-save-optim', action='store_true', default=None,
+                       help='Do not save current optimizer.')
+    group.add_argument('--no-save-rng', action='store_true', default=None,
+                       help='Do not save current rng state.')
+    group.add_argument('--load', type=str, default=None,
+                       help='Directory containing a model checkpoint.')
+    group.add_argument('--no-load-optim', action='store_true', default=None,
+                       help='Do not load optimizer when loading checkpoint.')
+    group.add_argument('--no-load-rng', action='store_true', default=None,
+                       help='Do not load rng state when loading checkpoint.')
+    group.add_argument('--finetune', action='store_true',
+                       help='Load model for finetuning. Do not load optimizer '
+                       'or rng state from checkpoint and set iteration to 0. '
+                       'Assumed when loading a release checkpoint.')
+
+    return parser
+
+
+def _add_mixed_precision_args(parser):
+    group = parser.add_argument_group(title='mixed precision')
+
+    group.add_argument('--fp16', action='store_true',
+                       help='Run model in fp16 mode.')
+    group.add_argument('--bf16', action='store_true',
+                       help='Run model in bfloat16 mode.')
+    group.add_argument('--loss-scale', type=float, default=None,
+                       help='Static loss scaling, positive power of 2 '
+                       'values can improve fp16 convergence. If None, dynamic'
+                       'loss scaling is used.')
+    group.add_argument('--initial-loss-scale', type=float, default=2**32,
+                       help='Initial loss-scale for dynamic loss scaling.')
+    group.add_argument('--min-loss-scale', type=float, default=1.0,
+                       help='Minimum loss scale for dynamic loss scale.')
+    group.add_argument('--loss-scale-window', type=float, default=1000,
+                       help='Window over which to raise/lower dynamic scale.')
+    group.add_argument('--hysteresis', type=int, default=2,
+                       help='hysteresis for dynamic loss scaling')
+    group.add_argument('--fp32-residual-connection', action='store_true',
+                       help='Move residual connections to fp32.')
+    group.add_argument('--no-query-key-layer-scaling', action='store_false',
+                       help='Do not scale Q * K^T by 1 / layer-number.',
+                       dest='apply_query_key_layer_scaling')
+    group.add_argument('--attention-softmax-in-fp32', action='store_true',
+                       help='Run attention masking and softmax in fp32. '
+                       'This flag is ignored unless '
+                       '--no-query-key-layer-scaling is specified.')
+    group.add_argument('--accumulate-allreduce-grads-in-fp32',
+                       action='store_true',
+                       help='Gradient accumulation and all-reduce in fp32.')
+    group.add_argument('--fp16-lm-cross-entropy', action='store_true',
+                       help='Move the cross entropy unreduced loss calculation'
+                       'for lm head to fp16.')
+
+    return parser
+
+
+def _add_distributed_args(parser):
+    group = parser.add_argument_group(title='distributed')
+
+    group.add_argument('--tensor-model-parallel-size', type=int, default=1,
+                       help='Degree of tensor model parallelism.')
+    group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
+                       help='Degree of pipeline model parallelism.')
+    group.add_argument('--model-parallel-size', type=int, default=None,
+                       help='Old model parallel argument, do not use. Use '
+                       '--tensor-model-parallel-size instead.')
+    group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
+                       help='Number of layers per virtual pipeline stage')
+    group.add_argument('--distributed-backend', default='nccl',
+                       choices=['nccl', 'gloo'],
+                       help='Which backend to use for distributed training.')
+    group.add_argument('--DDP-impl', default='local',
+                       choices=['local', 'torch'],
+                       help='which DistributedDataParallel implementation '
+                       'to use.')
+    group.add_argument('--use-contiguous-buffers-in-ddp', action='store_true',
+                       help='If set, use contiguous buffer in DDP. Note that '
+                       'this option only works woth local DDP.' )
+    group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
+                       help='Use scatter/gather to optimize communication of tensors in pipeline',
+                       dest='scatter_gather_tensors_in_pipeline')
+    group.add_argument('--local_rank', type=int, default=None,
+                       help='local rank passed from distributed launcher.')
+    group.add_argument('--lazy-mpu-init', type=bool, required=False,
+                       help='If set to True, initialize_megatron() '
+                       'skips DDP initialization and returns function to '
+                       'complete it instead.Also turns on '
+                       '--use-cpu-initialization flag. This is for '
+                       'external DDP manager.' )
+    group.add_argument('--use-cpu-initialization', action='store_true',
+                       default=None, help='If set, affine parallel weights '
+                       'initialization uses CPU' )
+    group.add_argument('--empty-unused-memory-level', default=0, type=int,
+                       choices=[0, 1, 2],
+                       help='Call torch.cuda.empty_cache() each iteration '
+                       '(training and eval), to reduce fragmentation.'
+                       '0=off, 1=moderate, 2=aggressive.')
+    return parser
+
+
+def _add_validation_args(parser):
+    group = parser.add_argument_group(title='validation')
+
+    group.add_argument('--eval-iters', type=int, default=100,
+                       help='Number of iterations to run for evaluation'
+                       'validation/test for.')
+    group.add_argument('--eval-interval', type=int, default=1000,
+                       help='Interval between running evaluation on '
+                       'validation set.')
+
+    return parser
+
+
+def _add_data_args(parser):
+    group = parser.add_argument_group(title='data and dataloader')
+
+    group.add_argument('--data-path', nargs='*', default=None,
+                       help='Path to the training dataset. Accepted format:'
+                       '1) a single data path, 2) multiple datasets in the'
+                       'form: dataset1-weight dataset1-path dataset2-weight '
+                       'dataset2-path ...')
+    group.add_argument('--split', type=str, default='969, 30, 1',
+                       help='Comma-separated list of proportions for training,'
+                       ' validation, and test split. For example the split '
+                       '`90,5,5` will use 90%% of data for training, 5%% for '
+                       'validation and 5%% for test.')
+    group.add_argument('--vocab-file', type=str, default=None,
+                       help='Path to the vocab file.')
+    group.add_argument('--merge-file', type=str, default=None,
+                       help='Path to the BPE merge file.')
+    group.add_argument('--vocab-extra-ids', type=int, default=0,
+                       help='Number of additional vocabulary tokens. '
+                            'They are used for span masking in the T5 model')
+    group.add_argument('--seq-length', type=int, default=None,
+                       help='Maximum sequence length to process.')
+    group.add_argument('--encoder-seq-length', type=int, default=None,
+                       help='Maximum encoder sequence length to process.'
+                       'This should be exclusive of --seq-length')
+    group.add_argument('--decoder-seq-length', type=int, default=None,
+                       help="Maximum decoder sequence length to process.")
+    group.add_argument('--retriever-seq-length', type=int, default=256,
+                       help='Maximum sequence length for the biencoder model '
+                        ' for retriever')
+    group.add_argument('--sample-rate', type=float, default=1.0,
+                       help='sample rate for training data. Supposed to be 0 '
+                            ' < sample_rate < 1')
+    group.add_argument('--mask-prob', type=float, default=0.15,
+                       help='Probability of replacing a token with mask.')
+    group.add_argument('--short-seq-prob', type=float, default=0.1,
+                       help='Probability of producing a short sequence.')
+    group.add_argument('--mmap-warmup', action='store_true',
+                       help='Warm up mmap files.')
+    group.add_argument('--num-workers', type=int, default=2,
+                       help="Dataloader number of workers.")
+    group.add_argument('--tokenizer-type', type=str,
+                       default=None,
+                       choices=['BertWordPieceLowerCase',
+                                'BertWordPieceCase',
+                                'GPT2BPETokenizer'],
+                       help='What type of tokenizer to use.')
+    group.add_argument('--data-impl', type=str, default='infer',
+                       choices=['lazy', 'cached', 'mmap', 'infer'],
+                       help='Implementation of indexed datasets.')
+    group.add_argument('--reset-position-ids', action='store_true',
+                       help='Reset posistion ids after end-of-document token.')
+    group.add_argument('--reset-attention-mask', action='store_true',
+                       help='Reset self attention maske after '
+                       'end-of-document token.')
+    group.add_argument('--eod-mask-loss', action='store_true',
+                       help='Mask loss for the end of document tokens.')
+
+    return parser
+
+
+def _add_autoresume_args(parser):
+    group = parser.add_argument_group(title='autoresume')
+
+    group.add_argument('--adlr-autoresume', action='store_true',
+                       help='Enable autoresume on adlr cluster.')
+    group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
+                       help='Intervals over which check for autoresume'
+                       'termination signal')
+
+    return parser
+
+
+def _add_biencoder_args(parser):
+    group = parser.add_argument_group(title='biencoder')
+
+    # network size
+    group.add_argument('--ict-head-size', type=int, default=None,
+                       help='Size of block embeddings to be used in ICT and '
+                        'REALM (paper default: 128)')
+    group.add_argument('--biencoder-projection-dim', type=int, default=0,
+                       help='Size of projection head used in biencoder (paper'
+                        ' default: 128)')
+    group.add_argument('--biencoder-shared-query-context-model', action='store_true',
+                        help='Whether to share the parameters of the query '
+                        'and context models or not')
+
+    # checkpointing
+    group.add_argument('--ict-load', type=str, default=None,
+                       help='Directory containing an ICTBertModel checkpoint')
+    group.add_argument('--bert-load', type=str, default=None,
+                       help='Directory containing an BertModel checkpoint '
+                       '(needed to start ICT and REALM)')
+
+    # data
+    group.add_argument('--titles-data-path', type=str, default=None,
+                       help='Path to titles dataset used for ICT')
+    group.add_argument('--query-in-block-prob', type=float, default=0.1,
+                       help='Probability of keeping query in block for '
+                       'ICT dataset')
+    group.add_argument('--use-one-sent-docs', action='store_true',
+                       help='Whether to use one sentence documents in ICT')
+    group.add_argument('--evidence-data-path', type=str, default=None,
+                       help='Path to Wikipedia Evidence frm DPR paper')
+
+    # training
+    group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
+                        default=[], help="Which top-k accuracies to report "
+                        "(e.g. '1 5 20')")
+    group.add_argument('--retriever-score-scaling', action='store_true',
+                       help='Whether to scale retriever scores by inverse '
+                        'square root of hidden size')
+
+    # faiss index
+    group.add_argument('--block-data-path', type=str, default=None,
+                       help='Where to save/load BlockData to/from')
+    group.add_argument('--embedding-path', type=str, default=None,
+                       help='Where to save/load Open-Retrieval Embedding'
+                        ' data to/from')
+
+    # indexer
+    group.add_argument('--indexer-batch-size', type=int, default=128,
+                       help='How large of batches to use when doing indexing '
+                       'jobs')
+    group.add_argument('--indexer-log-interval', type=int, default=1000,
+                       help='After how many batches should the indexer '
+                       'report progress')
+    return parser
+
+
+def _add_vit_args(parser):
+    group = parser.add_argument_group(title="vit")
+
+    group.add_argument('--num-classes', type=int, default=1000,
+                       help='num of classes in vision classificaiton task')
+    group.add_argument('--img-dim', type=int, default=224,
+                       help='Image size for vision classification task')
+    group.add_argument('--num-channels', type=int, default=3,
+                       help='Number of channels in input image data')
+    group.add_argument('--patch-dim', type=int, default=16,
+                       help='patch dimension used in vit')
+
+    return parser
--- a/apex/transformer/tensor_parallel/tests/commons.py
+++ b/apex/transformer/tensor_parallel/tests/commons.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import random
+
+import numpy
+import torch
+
+from apex import transformer
+from apex.transformer.tensor_parallel.tests import global_vars
+
+
+TEST_SUCCESS_MESSAGE = ">> passed the test :-)"
+
+
+class IdentityLayer(torch.nn.Module):
+    def __init__(self, size, scale=1.0):
+        super(IdentityLayer, self).__init__()
+        self.weight = torch.nn.Parameter(scale * torch.randn(size))
+
+    def forward(self):
+        return self.weight
+
+
+def set_random_seed(seed):
+    """Set random seed for reproducibility."""
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    transformer.tensor_parallel.model_parallel_cuda_manual_seed(seed)
+
+
+def initialize_distributed(backend='nccl'):
+    """Initialize torch.distributed."""
+    # Get local rank in case it is provided.
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument('--local_rank', type=int, default=None,
+    #                    help='local rank passed from distributed launcher')
+    # args = parser.parse_args()
+    args = global_vars.get_args()
+    local_rank = args.local_rank
+
+    # Get rank and world size.
+    rank = int(os.getenv('RANK', '0'))
+    world_size = int(os.getenv("WORLD_SIZE", '1'))
+
+    print('> initializing torch.distributed with local rank: {}, '
+          'rank: {}, world size: {}'.format(local_rank, rank, world_size))
+
+    # Set the device id.
+    device = rank % torch.cuda.device_count()
+    if local_rank is not None:
+        device = local_rank
+    torch.cuda.set_device(device)
+
+    # Call the init process.
+    init_method = 'tcp://'
+    master_ip = os.getenv('MASTER_ADDR', 'localhost')
+    master_port = os.getenv('MASTER_PORT', '6000')
+    init_method += master_ip + ':' + master_port
+    torch.distributed.init_process_group(
+        backend=backend,
+        world_size=world_size,
+        rank=rank,
+        init_method=init_method)
+
+
+def print_separator(message):
+    torch.distributed.barrier()
+    filler_len = (78 - len(message)) // 2
+    filler = '-' * filler_len
+    string = '\n' + filler + ' {} '.format(message) + filler
+    if torch.distributed.get_rank() == 0:
+        print(string, flush=True)
+    torch.distributed.barrier()
--- a/apex/transformer/tensor_parallel/tests/global_vars.py
+++ b/apex/transformer/tensor_parallel/tests/global_vars.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Megatron global variables."""
+import os
+import sys
+import time
+
+import torch
+
+from apex.transformer.tensor_parallel.microbatches import build_num_microbatches_calculator
+from apex.transformer.tensor_parallel.tests.arguments import parse_args
+
+_GLOBAL_ARGS = None
+_GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
+_GLOBAL_TOKENIZER = None
+_GLOBAL_TENSORBOARD_WRITER = None
+_GLOBAL_ADLR_AUTORESUME = None
+_GLOBAL_TIMERS = None
+
+
+def get_args():
+    """Return arguments."""
+    _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
+    return _GLOBAL_ARGS
+
+
+def get_num_microbatches():
+    return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get()
+
+
+def get_current_global_batch_size():
+    return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size()
+
+
+def update_num_microbatches(consumed_samples, consistency_check=True):
+    _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples,
+                                               consistency_check)
+
+
+# def get_tokenizer():
+#     """Return tokenizer."""
+#     _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
+#     return _GLOBAL_TOKENIZER
+
+
+def get_tensorboard_writer():
+    """Return tensorboard writer. It can be None so no need
+    to check if it is initialized."""
+    return _GLOBAL_TENSORBOARD_WRITER
+
+
+def get_adlr_autoresume():
+    """ADLR autoresume object. It can be None so no need
+    to check if it is initialized."""
+    return _GLOBAL_ADLR_AUTORESUME
+
+
+def get_timers():
+    """Return timers."""
+    _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
+    return _GLOBAL_TIMERS
+
+
+def set_global_variables(extra_args_provider=None, args_defaults={},
+                         ignore_unknown_args=False):
+    """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
+    args = _parse_args(extra_args_provider=extra_args_provider,
+                       defaults=args_defaults,
+                       ignore_unknown_args=ignore_unknown_args)
+    _build_num_microbatches_calculator(args)
+    # if args.vocab_file:
+    #     _ = _build_tokenizer(args)
+    _set_tensorboard_writer(args)
+    _set_adlr_autoresume(args)
+    _set_timers()
+
+
+def _parse_args(extra_args_provider=None, defaults={},
+                ignore_unknown_args=False):
+    """Parse entire arguments."""
+    global _GLOBAL_ARGS
+    _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
+    _GLOBAL_ARGS = parse_args(extra_args_provider=extra_args_provider,
+                              defaults=defaults,
+                              ignore_unknown_args=ignore_unknown_args)
+    return _GLOBAL_ARGS
+
+
+def _build_num_microbatches_calculator(args):
+
+    global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
+    _ensure_var_is_not_initialized(_GLOBAL_NUM_MICROBATCHES_CALCULATOR,
+                                   'num microbatches calculator')
+
+    _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator(
+        args)
+
+
+# def _build_tokenizer(args):
+#     """Initialize tokenizer."""
+#     global _GLOBAL_TOKENIZER
+#     _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
+#     _GLOBAL_TOKENIZER = build_tokenizer(args)
+#     return _GLOBAL_TOKENIZER
+
+
+# def rebuild_tokenizer(args):
+#     global _GLOBAL_TOKENIZER
+#     _GLOBAL_TOKENIZER = None
+#     return _build_tokenizer(args)
+
+
+def _set_tensorboard_writer(args):
+    """Set tensorboard writer."""
+    global _GLOBAL_TENSORBOARD_WRITER
+    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
+                                   'tensorboard writer')
+
+    if hasattr(args, 'tensorboard_dir') and \
+       args.tensorboard_dir and args.rank == (args.world_size - 1):
+        try:
+            from torch.utils.tensorboard import SummaryWriter
+            print('> setting tensorboard ...')
+            _GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
+                log_dir=args.tensorboard_dir,
+                max_queue=args.tensorboard_queue_size)
+        except ModuleNotFoundError:
+            print('WARNING: TensorBoard writing requested but is not '
+                  'available (are you using PyTorch 1.1.0 or later?), '
+                  'no TensorBoard logs will be written.', flush=True)
+
+
+def _set_adlr_autoresume(args):
+    """Initialize ADLR autoresume."""
+    global _GLOBAL_ADLR_AUTORESUME
+    _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')
+
+    if args.adlr_autoresume:
+        if args.rank == 0:
+            print('enabling autoresume ...', flush=True)
+        sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
+        try:
+            from userlib.auto_resume import AutoResume
+        except BaseException:
+            print('ADLR autoresume is not available, exiting ...')
+            sys.exit()
+
+        _GLOBAL_ADLR_AUTORESUME = AutoResume
+
+
+def _set_timers():
+    """Initialize timers."""
+    global _GLOBAL_TIMERS
+    _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
+    _GLOBAL_TIMERS = Timers()
+
+
+def _ensure_var_is_initialized(var, name):
+    """Make sure the input variable is not None."""
+    assert var is not None, '{} is not initialized.'.format(name)
+
+
+def _ensure_var_is_not_initialized(var, name):
+    """Make sure the input variable is not None."""
+    assert var is None, '{} is already initialized.'.format(name)
+
+
+class _Timer:
+    """Timer."""
+
+    def __init__(self, name):
+        self.name_ = name
+        self.elapsed_ = 0.0
+        self.started_ = False
+        self.start_time = time.time()
+
+    def start(self):
+        """Start the timer."""
+        assert not self.started_, 'timer has already been started'
+        torch.cuda.synchronize()
+        self.start_time = time.time()
+        self.started_ = True
+
+    def stop(self):
+        """Stop the timer."""
+        assert self.started_, 'timer is not started'
+        torch.cuda.synchronize()
+        self.elapsed_ += (time.time() - self.start_time)
+        self.started_ = False
+
+    def reset(self):
+        """Reset timer."""
+        self.elapsed_ = 0.0
+        self.started_ = False
+
+    def elapsed(self, reset=True):
+        """Calculate the elapsed time."""
+        started_ = self.started_
+        # If the timing in progress, end it first.
+        if self.started_:
+            self.stop()
+        # Get the elapsed time.
+        elapsed_ = self.elapsed_
+        # Reset the elapsed time
+        if reset:
+            self.reset()
+        # If timing was in progress, set it back.
+        if started_:
+            self.start()
+        return elapsed_
+
+
+class Timers:
+    """Group of timers."""
+
+    def __init__(self):
+        self.timers = {}
+
+    def __call__(self, name):
+        if name not in self.timers:
+            self.timers[name] = _Timer(name)
+        return self.timers[name]
+
+    def write(self, names, writer, iteration, normalizer=1.0, reset=False):
+        """Write timers to a tensorboard writer"""
+        # currently when using add_scalars,
+        # torch.utils.add_scalars makes each timer its own run, which
+        # polutes the runs list, so we just add each as a scalar
+        assert normalizer > 0.0
+        for name in names:
+            value = self.timers[name].elapsed(reset=reset) / normalizer
+            writer.add_scalar(name + '-time', value, iteration)
+
+    def log(self, names, normalizer=1.0, reset=True):
+        """Log a group of timers."""
+        assert normalizer > 0.0
+        string = 'time (ms)'
+        for name in names:
+            elapsed_time = self.timers[name].elapsed(
+                reset=reset) * 1000.0 / normalizer
+            string += ' | {}: {:.2f}'.format(name, elapsed_time)
+        if torch.distributed.is_initialized():
+            if torch.distributed.get_rank() == (
+                    torch.distributed.get_world_size() - 1):
+                print(string, flush=True)
+        else:
+            print(string, flush=True)
--- a/apex/transformer/tensor_parallel/utils.py
+++ b/apex/transformer/tensor_parallel/utils.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)
+
+
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+
+
+def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
+    """Split a tensor along its last dimension.
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks amd return the
+        first and last index of the vocabulary belonging to the `rank`
+        partition: Note that indecies in [fist, last)"""
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size)
--- a/csrc/layer_norm_cuda.cpp
+++ b/csrc/layer_norm_cuda.cpp
@@ -130,12 +130,13 @@ std::vector<at::Tensor> layer_norm(
  int n1,n2;
  check_args(input,normalized_shape,n1,n2);
  at::Tensor output = at::empty_like(input);
-  at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input.scalar_type()));
+  at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type()==at::ScalarType::Half || input.scalar_type()==at::ScalarType::BFloat16 ? at::ScalarType::Float : input.scalar_type()));
  at::Tensor invvar = at::empty_like(mean);
  cuda_layer_norm(&output,&mean,&invvar,&input,n1,n2,
      normalized_shape,NULL,NULL,epsilon);
  return {output, mean, invvar};
 }
+
 std::vector<at::Tensor> layer_norm_affine(
    at::Tensor input,
    #ifdef VERSION_GE_1_1
@@ -152,13 +153,35 @@ std::vector<at::Tensor> layer_norm_affine(
  int n1,n2;
  check_args(input,normalized_shape,gamma,beta,n1,n2);
  at::Tensor output = at::empty_like(input);
-  at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input.scalar_type()));
+  const auto stats_dtype = (input.scalar_type() == at::ScalarType::Half || input.scalar_type() == at::ScalarType::BFloat16) ? at::ScalarType::Float : input.scalar_type();
+  at::Tensor mean = at::empty({n1}, input.options().dtype(stats_dtype));
  at::Tensor invvar = at::empty_like(mean);
  cuda_layer_norm(&output,&mean,&invvar,&input,n1,n2,
      normalized_shape,&gamma,&beta,epsilon);
  return {output, mean, invvar};
 }

+std::vector<at::Tensor> layer_norm_affine_mixed_dtypes(
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta,
+    double epsilon) {
+  CHECK_INPUT(input);
+  int n1, n2;
+  check_args(input, normalized_shape, n1, n2);
+  at::Tensor output = at::empty_like(input, gamma.options().dtype(gamma.scalar_type()));
+  at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type() == at::ScalarType::Half || input.scalar_type() == at::ScalarType::BFloat16 ? at::ScalarType::Float : input.scalar_type()));
+  at::Tensor invvar = at::empty_like(mean);
+   cuda_layer_norm(&output, &mean, &invvar, &input, n1, n2,
+      normalized_shape, &gamma, &beta, epsilon);
+  return {output, mean, invvar};
+}
+
 void cuda_layer_norm_gradient(
    at::Tensor* dout,
    at::Tensor* mean,
@@ -202,6 +225,7 @@ at::Tensor layer_norm_gradient(
      &grad_input,NULL,NULL);
  return grad_input;
 }
+
 std::vector<at::Tensor> layer_norm_gradient_affine(
    at::Tensor dout,
    at::Tensor mean,
@@ -237,5 +261,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &layer_norm, "LayerNorm forward (CUDA)");
  m.def("backward_affine", &layer_norm_gradient_affine, "LayerNorm backward (CUDA)");
  m.def("backward", &layer_norm_gradient, "LayerNorm backward (CUDA)");
+
+  m.def("forward_affine_mixed_dtypes", &layer_norm_affine_mixed_dtypes, "LayerNorm forward with mixed dtypes (CUDA) compatible with Megatron's implementation");
 }

--- a/csrc/layer_norm_cuda_kernel.cu
+++ b/csrc/layer_norm_cuda_kernel.cu
@@ -56,7 +56,7 @@ void cuWelfordMuSigma2(
  const int i1,
  U& mu,
  U& sigma2,
-  U* buf) 
+  U* buf)
 {
  // Assumptions:
  // 1) blockDim.x == warpSize
@@ -140,7 +140,7 @@ void cuWelfordMuSigma2(
  const int i1,
  float& mu,
  float& sigma2,
-  float* buf) 
+  float* buf)
 {
  // Assumptions:
  // 1) blockDim.x == warpSize
@@ -173,7 +173,7 @@ void cuWelfordMuSigma2(
      for (int k = 0;  k < 8;  k+=2) {
        float2 curr = __half22float2(*((__half2*)(lvals+l+k)));
        cuWelfordOnlineSum(curr.x,mu,sigma2,count);
-	cuWelfordOnlineSum(curr.y,mu,sigma2,count);
+        cuWelfordOnlineSum(curr.y,mu,sigma2,count);
      }
    }
    for (;  l < n2;  ++l) {
@@ -276,18 +276,18 @@ struct SharedMemory <double>
 };
 }

-template<typename T, typename U> __global__
-void cuApplyLayerNorm(
-  T* __restrict__ output_vals,
+template<typename T, typename U, typename V> __device__
+void cuApplyLayerNorm_(
+  V* __restrict__ output_vals,
  U* __restrict__ mean,
  U* __restrict__ invvar,
  const T* __restrict__ vals,
  const int n1,
  const int n2,
  const U epsilon,
-  const T* __restrict__ gamma,
-  const T* __restrict__ beta
-  ) 
+  const V* __restrict__ gamma,
+  const V* __restrict__ beta
+  )
 {
  // Assumptions:
  // 1) blockDim.x == warpSize
@@ -299,19 +299,19 @@ void cuApplyLayerNorm(
    U mu,sigma2;
    cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf);
    const T* lvals = vals + i1*n2;
-    T* ovals = output_vals + i1*n2;
+    V* ovals = output_vals + i1*n2;
    U c_invvar = rsqrt(sigma2 + epsilon);
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != NULL && beta != NULL) {
      for (int i = thrx;  i < n2;  i+=numx) {
        U curr = static_cast<U>(lvals[i]);
-        ovals[i] = gamma[i] * static_cast<T>(c_invvar * (curr - mu)) + beta[i];
+        ovals[i] = gamma[i] * static_cast<V>(c_invvar * (curr - mu)) + beta[i];
      }
    } else {
      for (int i = thrx;  i < n2;  i+=numx) {
        U curr = static_cast<U>(lvals[i]);
-        ovals[i] = static_cast<T>(c_invvar * (curr - mu));
+        ovals[i] = static_cast<V>(c_invvar * (curr - mu));
      }
    }
    if (threadIdx.x == 0 && threadIdx.y == 0) {
@@ -321,7 +321,24 @@ void cuApplyLayerNorm(
  }
 }

-template<typename T, typename U> __device__
+template<typename T, typename U, typename V=T> __global__
+void cuApplyLayerNorm(
+  V* __restrict__ output_vals,
+  U* __restrict__ mean,
+  U* __restrict__ invvar,
+  const T* __restrict__ vals,
+  const int n1,
+  const int n2,
+  const U epsilon,
+  const V* __restrict__ gamma,
+  const V* __restrict__ beta
+  )
+{
+  cuApplyLayerNorm_<T, U, V>(output_vals, mean, invvar, vals, n1, n2, epsilon, gamma, beta);
+}
+
+
+template<typename T, typename U, typename V> __device__
 void cuLoadWriteStridedInputs(
    const int i1_block,
    const int thr_load_row_off,
@@ -331,7 +348,7 @@ void cuLoadWriteStridedInputs(
    U* warp_buf1,
    U* warp_buf2,
    const T* input,
-    const T* dout,
+    const V* dout,
    const int i1_end,
    const int n2,
    const U* __restrict__ mean,
@@ -348,9 +365,9 @@ void cuLoadWriteStridedInputs(
      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
      if (i2<n2) {
        U curr_input = static_cast<U>(input[load_idx]);
-	U curr_dout = static_cast<U>(dout[load_idx]);
-	warp_buf1[write_idx] = curr_dout;
-	warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_invvar;
+        U curr_dout = static_cast<U>(dout[load_idx]);
+        warp_buf1[write_idx] = curr_dout;
+        warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_invvar;
      } else {
        warp_buf1[write_idx] = U(0);
        warp_buf2[write_idx] = U(0);
@@ -365,7 +382,7 @@ void cuLoadWriteStridedInputs(
  }
 }

-template<typename T, typename U> __device__
+template<typename T, typename U, typename V> __device__
 void cuLoadAddStridedInputs(
    const int i1_block,
    const int thr_load_row_off,
@@ -375,7 +392,7 @@ void cuLoadAddStridedInputs(
    U* warp_buf1,
    U* warp_buf2,
    const T* input,
-    const T* dout,
+    const V* dout,
    const int i1_end,
    const int n2,
    const U* __restrict__ mean,
@@ -392,17 +409,17 @@ void cuLoadAddStridedInputs(
      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
      if (i2<n2) {
        U curr_input = static_cast<U>(input[load_idx]);
-	U curr_dout = static_cast<U>(dout[load_idx]);
-	warp_buf1[write_idx] += curr_dout;
-	warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_invvar;
+        U curr_dout = static_cast<U>(dout[load_idx]);
+        warp_buf1[write_idx] += curr_dout;
+        warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_invvar;
      }
    }
  }
 }

-template<typename T, typename U> __global__
+template<typename T, typename U, typename V> __global__
 void cuComputePartGradGammaBeta(
-    const T* __restrict__ dout,
+    const V* __restrict__ dout,
    const T* __restrict__ input,
    const int n1,
    const int n2,
@@ -449,11 +466,11 @@ void cuComputePartGradGammaBeta(
    for (int offset = blockDim.y/2;  offset > 1;  offset /= 2) {
      if (threadIdx.y < offset) {
        int row1 = threadIdx.y;
-	int row2 = threadIdx.y + offset;
-	int idx1 = row1*row_stride + threadIdx.x;
-	int idx2 = row2*row_stride + threadIdx.x;
-	warp_buf1[idx1] += warp_buf1[idx2];
-	warp_buf2[idx1] += warp_buf2[idx2];
+        int row2 = threadIdx.y + offset;
+        int idx1 = row1*row_stride + threadIdx.x;
+        int idx2 = row2*row_stride + threadIdx.x;
+        warp_buf1[idx1] += warp_buf1[idx2];
+        warp_buf2[idx1] += warp_buf2[idx2];
      }
      __syncthreads();
    }
@@ -468,19 +485,19 @@ void cuComputePartGradGammaBeta(
    }
 }

-template<typename T, typename U> __global__
+template<typename U, typename V> __global__
 void cuComputeGradGammaBeta(
    const U* part_grad_gamma,
    const U* part_grad_beta,
    const int part_size,
    const int n1,
    const int n2,
-    T* grad_gamma,
-    T* grad_beta)
+    V* grad_gamma,
+    V* grad_beta)
 {
    // sum partial gradients for gamma and beta
    SharedMemory<U> shared;
-    U* buf = shared.getPointer(); 
+    U* buf = shared.getPointer();
    int i2 = blockIdx.x * blockDim.x + threadIdx.x;
    if (i2 < n2) {
      // each warp does sequential reductions until reduced part_size is num_warps
@@ -519,16 +536,16 @@ void cuComputeGradGammaBeta(
    }
 }

-template<typename T, typename U> __global__
+template<typename T, typename U, typename V> __global__
 void cuComputeGradInput(
-    const T* __restrict__ dout,
+    const V* __restrict__ dout,
    const T* __restrict__ input,
    const int n1,
    const int n2,
    const U* __restrict__ mean,
    const U* __restrict__ invvar,
    U epsilon,
-    const T* gamma,
+    const V* gamma,
    T* grad_input)
 {
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
@@ -537,7 +554,7 @@ void cuComputeGradInput(
    const U c_mean = mean[i1];
    const U c_invvar = invvar[i1];
    const T* k_input = input + i1*n2;
-    const T* k_dout = dout + i1*n2;
+    const V* k_dout = dout + i1*n2;
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != NULL) {
@@ -581,7 +598,7 @@ void cuComputeGradInput(
    // inter-warp reductions
    if (blockDim.y > 1) {
      SharedMemory<U> shared;
-      U* buf = shared.getPointer(); 
+      U* buf = shared.getPointer();
      for (int offset = blockDim.y/2;  offset > 0;  offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
@@ -606,7 +623,7 @@ void cuComputeGradInput(
      if (threadIdx.y !=0) {
        sum_loss1 = buf[2*threadIdx.x];
        sum_loss2 = buf[2*threadIdx.x+1];
-      } 
+      }
    }
    // all threads now have the two sums over l
    U fH = (U)n2;
@@ -636,35 +653,29 @@ void cuComputeGradInput(
  }
 }

-template<typename T, typename U> 
+template<typename T, typename U, typename V=T>
 void HostApplyLayerNorm(
-    T* output,
+    V* output,
    U* mean,
    U* invvar,
    const T* input,
    int n1,
    int n2,
    double epsilon,
-    const T* gamma,
-    const T* beta
+    const V* gamma,
+    const V* beta
    )
 {
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    const dim3 threads(32,4,1);
    const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
    const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
-    int nshared = 
-        threads.y > 1 ? 
-	    threads.y*sizeof(U)+(threads.y/2)*sizeof(U) : 
+    int nshared =
+        threads.y > 1 ?
+	    threads.y*sizeof(U)+(threads.y/2)*sizeof(U) :
 	    0;
    cuApplyLayerNorm<<<blocks, threads, nshared, stream>>>(
-		    output,
-		    mean,
-		    invvar,
-		    input,
-		    n1,n2,
-		    U(epsilon),
-                    gamma,beta);
+      output, mean, invvar, input, n1, n2, U(epsilon), gamma, beta);
 }

 void cuda_layer_norm(
@@ -684,34 +695,35 @@ void cuda_layer_norm(
    double epsilon)
 {
    using namespace at;
-    DISPATCH_DOUBLE_FLOAT_AND_HALF(input->scalar_type(), 0, "layer_norm_cuda_kernel",
-        using accscalar_t = at::acc_type<scalar_t_0, true>;
-        HostApplyLayerNorm(
-            output->DATA_PTR<scalar_t_0>(),
-	    mean->DATA_PTR<accscalar_t>(),
-	    invvar->DATA_PTR<accscalar_t>(),
-	    input->DATA_PTR<scalar_t_0>(),
-	    n1,n2,
-	    epsilon,
-	    gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
-	    beta != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL);
+    DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(
+        input->scalar_type(), output->scalar_type(), "layer_norm_cuda_kernel",
+        using accscalar_t = at::acc_type<scalar_t_in, true>;
+        HostApplyLayerNorm<scalar_t_in, accscalar_t, scalar_t_out>(
+          output->DATA_PTR<scalar_t_out>(),
+	      mean->DATA_PTR<accscalar_t>(),
+          invvar->DATA_PTR<accscalar_t>(),
+          input->DATA_PTR<scalar_t_in>(),
+          n1,n2,
+          epsilon,
+          gamma != NULL ? gamma->DATA_PTR<scalar_t_out>() : NULL,
+          beta != NULL ? beta->DATA_PTR<scalar_t_out>() : NULL);
      )
 }

-template<typename T, typename U> 
+template<typename T, typename U=float, typename V=T>
 void HostLayerNormGradient(
-    const T* dout,
+    const V* dout,
    const U* mean,
    const U* invvar,
    at::Tensor* input,
    int n1,
    int n2,
-    const T* gamma,
-    const T* beta,
+    const V* gamma,
+    const V* beta,
    double epsilon,
    T* grad_input,
-    T* grad_gamma,
-    T* grad_beta
+    V* grad_gamma,
+    V* grad_beta
    )
 {
    auto stream = at::cuda::getCurrentCUDAStream().stream();
@@ -724,7 +736,13 @@ void HostLayerNormGradient(
      const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
      const int nshared2_b = threads2.x * threads2.y * sizeof(U);
      const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b;
-      at::Tensor part_grad_gamma = at::empty({part_size,n2}, input->options().dtype(input->scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input->scalar_type()));
+      // note (mkozuki): I can hard code part_grad_gamma's dtype as float given that
+      // the `cuda_layer_norm_gradient` doesn't support double.
+      const auto part_grad_dtype =
+        (input->scalar_type() == at::ScalarType::Half || input->scalar_type() == at::ScalarType::BFloat16) ?
+        at::ScalarType::Float :
+        input->scalar_type();
+      at::Tensor part_grad_gamma = at::empty({part_size,n2}, input->options().dtype(part_grad_dtype));
      at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
      cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
 		      dout,
@@ -787,21 +805,23 @@ void cuda_layer_norm_gradient(
    at::Tensor* grad_beta)
 {
    using namespace at;
-    DISPATCH_FLOAT_AND_HALF(input->scalar_type(), 0, "cuComputeGradInput",
-        using accscalar_t = at::acc_type<scalar_t_0, true>;
-        HostLayerNormGradient(
-	    dout->DATA_PTR<scalar_t_0>(),
-	    mean->DATA_PTR<accscalar_t>(),
-	    invvar->DATA_PTR<accscalar_t>(),
-	    input,
-	    n1,n2,
+    // we can do away with `accscalar_t` as there're only three dtypes: fp32, fp16, bf16
+    DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(
+      input->scalar_type(), gamma == NULL ? input->scalar_type() :  gamma->scalar_type(), "cuComputeGradInput",
+      using accscalar_t = at::acc_type<scalar_t_in, true>;
+      HostLayerNormGradient(
+        dout->DATA_PTR<scalar_t_out>(),
+        mean->DATA_PTR<accscalar_t>(),
+        invvar->DATA_PTR<accscalar_t>(),
+        input,
+        n1,n2,
            // TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta
            // if gamma Tensor is NULL on input.
-	    gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
-	    gamma != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL,
-	    epsilon,
-	    grad_input->DATA_PTR<scalar_t_0>(),
-	    gamma != NULL ? grad_gamma->DATA_PTR<scalar_t_0>() : NULL,
-	    gamma != NULL ? grad_beta->DATA_PTR<scalar_t_0>() : NULL);
-      )
+        gamma != NULL ? gamma->DATA_PTR<scalar_t_out>() : NULL,
+        gamma != NULL ? beta->DATA_PTR<scalar_t_out>() : NULL,
+        epsilon,
+        grad_input->DATA_PTR<scalar_t_in>(),
+        gamma != NULL ? grad_gamma->DATA_PTR<scalar_t_out>() : NULL,
+        gamma != NULL ? grad_beta->DATA_PTR<scalar_t_out>() : NULL);
+    )
 }
--- a/csrc/megatron/scaled_masked_softmax.cpp
+++ b/csrc/megatron/scaled_masked_softmax.cpp
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_fp16.h>
+#include <torch/extension.h>
+#include <vector>
+
+namespace multihead_attn {
+namespace fused_softmax {
+namespace scaled_masked_softmax {
+
+torch::Tensor fwd_cuda(
+    torch::Tensor const& input, 
+    torch::Tensor const& mask,
+    float scale_factor);
+
+torch::Tensor bwd_cuda(
+    torch::Tensor const& output_grads, 
+    torch::Tensor const& softmax_results,
+    float scale_factor);
+
+int get_batch_per_block_cuda(
+    int query_seq_len,
+    int key_seq_len,
+    int batches,
+    int attn_heads);
+
+torch::Tensor fwd(
+    torch::Tensor const& input,
+    torch::Tensor const& mask,
+    float scale_factor) {
+  AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
+  AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
+	     (input.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");
+  AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
+
+  return fwd_cuda(input, mask, scale_factor);
+}
+
+torch::Tensor bwd(
+    torch::Tensor const& output_grads, 
+    torch::Tensor const& softmax_results,
+    float scale_factor) {
+
+  AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor");
+  AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor");
+
+  AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
+	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");
+  AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
+	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");
+
+  return bwd_cuda(output_grads, softmax_results, scale_factor);
+}
+
+int get_batch_per_block(
+    int query_seq_len,
+    int key_seq_len,
+    int batches,
+    int attn_heads) {
+    return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
+}
+
+} // end namespace scaled_masked_softmax
+} // end namespace fused_softmax
+} // end namespace multihead_attn
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", 
+        &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, 
+	"Self Multihead Attention scaled, time masked softmax -- Forward.");
+
+  m.def("backward",
+        &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
+	"Self Multihead Attention scaled, time masked softmax -- Backward.");
+
+  m.def("get_batch_per_block",
+        &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
+        "Return Batch per block size."
+  );
+}
--- a/csrc/megatron/scaled_masked_softmax.h
+++ b/csrc/megatron/scaled_masked_softmax.h
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <assert.h>
+#include <cuda_fp16.h>
+#include <cfloat>
+#include <limits>
+#include <stdint.h>
+#include <cuda_fp16.h>
+#include <c10/macros/Macros.h>
+
+namespace {
+
+template <typename Datatype, int ELEMENTS_PER_LDG>
+__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src);
+
+template <>
+__device__ __inline__ void copy_vector<c10::BFloat16, 1>(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; }
+
+template <>
+__device__ __inline__ void copy_vector<c10::BFloat16, 4>(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); }
+
+template <>
+__device__ __inline__ void copy_vector<c10::Half, 1>(c10::Half *dst, const c10::Half *src) { *dst = *src; }
+
+template <>
+__device__ __inline__ void copy_vector<c10::Half, 4>(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); }
+
+template <>
+__device__ __inline__ void copy_vector<uint8_t, 1>(uint8_t *dst, const uint8_t *src) { *dst = *src; }
+
+template <>
+__device__ __inline__ void copy_vector<uint8_t, 4>(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); }
+
+int log2_ceil(int value) {
+    int log2_value = 0;
+    while ((1 << log2_value) < value) ++log2_value;
+    return log2_value;
+}
+
+template<typename T>
+struct Add {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a + b;
+  }
+};
+
+template<typename T>
+struct Max {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a < b ? b : a;
+  }
+};
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+    return __shfl_xor_sync(mask, value, laneMask, width);
+#else
+    return __shfl_xor(value, laneMask, width);
+#endif
+}
+
+template <typename acc_t, int WARP_BATCH, int WARP_SIZE, template<typename> class ReduceOp>
+__device__ __forceinline__ void warp_reduce(acc_t* sum) {
+    ReduceOp<acc_t> r;
+    #pragma unroll
+    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+        #pragma unroll
+        for (int i = 0;  i < WARP_BATCH;  ++i) {
+            acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE);
+            sum[i] = r(sum[i], b);
+        }
+    }
+}
+
+/*
+ * Extended softmax (from native aten pytorch) with following additional features
+ * 1) input scaling
+ * 2) Explicit masking
+ */	
+template <typename input_t, typename output_t, typename acc_t, int log2_elements>
+__global__ void scaled_masked_softmax_warp_forward(
+    output_t *dst, 
+    const input_t *src,
+    const uint8_t *mask, 
+    const acc_t scale, 
+    int micro_batch_size, 
+    int element_count,
+    int pad_batches) 
+{
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and 
+    // warp_size of method warp_softmax_forward_kernel.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
+    constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
+
+    // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, )
+    // gridDim/blockIdx = (seq_len, attn_heads, batches) 
+    int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH;
+    int pad_first_batch = 0;
+    if (pad_batches != 1) { // bert style
+        pad_first_batch = (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH;
+    } else { // gpt2 style
+        pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
+    }
+
+    // micro_batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to computed within this WARP.
+    int local_batches = micro_batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x;
+
+    src += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
+    dst += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
+    mask += pad_first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
+
+    // load data from global memory
+    acc_t elements[WARP_BATCH][WARP_ITERATIONS];
+    input_t temp_data[ELEMENTS_PER_LDG_STG];
+    uint8_t temp_mask[ELEMENTS_PER_LDG_STG];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
+            int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
+
+            if (element_index < batch_element_count) {
+                int itr_idx = i*element_count+it*WARP_SIZE;
+                copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_data, src + itr_idx);
+                copy_vector<uint8_t, ELEMENTS_PER_LDG_STG>(temp_mask, mask + itr_idx);
+
+                #pragma unroll
+                  for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                      if (temp_mask[element] != 1) {
+                          elements[i][it + element] = (acc_t)temp_data[element] * scale;
+                      } else {
+                          elements[i][it + element] = -10000.0;
+                      }
+                  }
+            } else {
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
+                }
+            }
+        }
+    }
+
+    // compute max_value
+    acc_t max_value[WARP_BATCH];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        max_value[i] = elements[i][0];
+        #pragma unroll
+        for (int it = 1;  it < WARP_ITERATIONS;  ++it) {
+            max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);
+
+    acc_t sum[WARP_BATCH] { 0.0f };
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            elements[i][it] = std::exp((elements[i][it] - max_value[i]));
+            sum[i] += elements[i][it];
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
+
+    // store result
+    output_t out[ELEMENTS_PER_LDG_STG];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
+            int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    out[element] = elements[i][it + element] / sum[i];
+                }
+                copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);  
+            } else {
+                break;
+            } 
+        }
+    }
+}
+
+template <typename input_t, typename output_t, typename acc_t, int log2_elements>
+__global__ void scaled_masked_softmax_warp_backward(
+    output_t *gradInput, 
+    input_t *grad, 
+    const input_t *output,
+    acc_t scale, 
+    int micro_batch_size, 
+    int element_count)
+{
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and 
+    // warp_size of method warp_softmax_backward_kernel.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
+    constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
+
+    // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, )
+    // gridDim/blockIdx = (seq_len, attn_heads, batches) 
+    int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
+    
+    // micro_batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to computed within this WARP.
+    int local_batches = micro_batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x;
+
+    // the first element to process by the current thread
+    int thread_offset = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
+    grad += thread_offset;
+    output += thread_offset;
+    gradInput += thread_offset;
+
+    // load data from global memory
+    acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f };
+    acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f };
+    input_t temp_grad[ELEMENTS_PER_LDG_STG];
+    input_t temp_output[ELEMENTS_PER_LDG_STG];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
+            int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
+            if (element_index < batch_element_count) {
+                copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_grad, grad + i * element_count + it * WARP_SIZE);
+                copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_output, output + i * element_count + it * WARP_SIZE);
+
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    output_reg[i][it + element] = (acc_t)temp_output[element];
+                }
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element];
+                }
+            } 
+        }
+    }
+   
+    acc_t sum[WARP_BATCH];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        sum[i] = grad_reg[i][0];
+        #pragma unroll
+        for (int it = 1;  it < WARP_ITERATIONS;  ++it) {
+            sum[i] += grad_reg[i][it];
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
+
+    // store result
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
+            int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                // compute gradients
+                output_t out[ELEMENTS_PER_LDG_STG];
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i]));
+                }
+                copy_vector<output_t, ELEMENTS_PER_LDG_STG>(gradInput + i * element_count + it * WARP_SIZE, out);
+            } 
+        }
+    }
+}
+} // end of anonymous namespace
+
+int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads){
+    int log2_elements = log2_ceil(key_seq_len);
+    const int next_power_of_two = 1 << log2_elements;
+
+    int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+    constexpr int threads_per_block = 128;
+    int warps_per_block = (threads_per_block / warp_size);
+    int batches_per_block = warps_per_block * batches_per_warp;
+
+    return batches_per_block;
+}
+
+template<typename input_t, typename output_t, typename acc_t>
+void dispatch_scaled_masked_softmax_forward(
+    output_t *dst, 
+    const input_t *src, 
+    const uint8_t *mask,
+    const input_t scale, 
+    int query_seq_len, 
+    int key_seq_len, 
+    int batches,
+    int attn_heads,
+    int pad_batches)
+{
+    TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 2048 );
+    if (key_seq_len == 0) {
+        return;
+    } else {
+        int log2_elements = log2_ceil(key_seq_len);
+        const int next_power_of_two = 1 << log2_elements;
+        int batch_count = batches * attn_heads * query_seq_len;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
+        int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximimize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        TORCH_INTERNAL_ASSERT(query_seq_len%batches_per_block == 0);
+        dim3 blocks(query_seq_len/batches_per_block, attn_heads, batches);
+        dim3 threads(warp_size, warps_per_block, 1);
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            case 0: // 1
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 0>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 1: // 2
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 1>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 2: // 4
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 2>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 3: // 8
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 3>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 4: // 16
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 4>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 5: // 32
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 5>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 6: // 64
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 6>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 7: // 128
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 7>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 8: // 256
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 8>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 9: // 512
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 9>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 10: // 1024
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 10>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            case 11: // 2048
+                scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 11>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
+                break;
+            default:
+                break;
+        }
+    }
+}
+
+template<typename input_t, typename output_t, typename acc_t>
+void dispatch_scaled_masked_softmax_backward(
+    output_t *grad_input, 
+    input_t *grad, 
+    const input_t *output, 
+    const acc_t scale, 
+    int query_seq_len, 
+    int key_seq_len, 
+    int batches,
+    int attn_heads)
+{
+    TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 2048 );
+    if (key_seq_len == 0) {
+       return;
+    } else {
+        int log2_elements = log2_ceil(key_seq_len);
+        const int next_power_of_two = 1 << log2_elements;
+        int batch_count = batches *  attn_heads * query_seq_len;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
+        int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximimize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        int blocks = batch_count/batches_per_block;
+        dim3 threads(warp_size, warps_per_block, 1);
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            case 0: // 1
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 0>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 1: // 2
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 1>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 2: // 4
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 2>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 3: // 8
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 3>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 4: // 16
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 4>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 5: // 32
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 5>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 6: // 64
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 6>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 7: // 128
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 7>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 8: // 256
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 8>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 9: // 512
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 9>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 10: // 1024
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 10>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            case 11: // 2048
+                scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 11>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
+                break;
+            default:
+                break;
+        }
+    }
+}
--- a/csrc/megatron/scaled_masked_softmax_cuda.cu
+++ b/csrc/megatron/scaled_masked_softmax_cuda.cu
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_profiler_api.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include "scaled_masked_softmax.h"
+#include "type_shim.h"
+
+namespace multihead_attn {
+namespace fused_softmax {
+namespace scaled_masked_softmax {
+
+int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){
+    return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads);
+}
+
+
+torch::Tensor fwd_cuda(
+    torch::Tensor const& input,
+    torch::Tensor const& mask,
+    float scale_factor)
+{
+  // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
+  const int batches = input.size(0);
+  const int pad_batches = mask.size(0);
+  const int attn_heads = input.size(1);
+  const int query_seq_len = input.size(2);
+  const int key_seq_len = input.size(3);
+  TORCH_INTERNAL_ASSERT(key_seq_len <= 2048);
+  TORCH_INTERNAL_ASSERT(query_seq_len > 1);
+  TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches);
+  TORCH_INTERNAL_ASSERT(mask.size(1) == 1);
+  TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len);
+  TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len);
+
+  // Output 
+  auto act_options = input.options().requires_grad(false);
+  torch::Tensor softmax_results = 
+      torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options);
+
+  // Softmax Intermediate Result Ptr
+  void* input_ptr = static_cast<void*>(input.data_ptr());
+  void* mask_ptr = static_cast<void*>(mask.data_ptr());
+  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
+
+  DISPATCH_HALF_AND_BFLOAT(
+      input.scalar_type(),
+      "dispatch_scaled_masked_softmax_forward",
+      dispatch_scaled_masked_softmax_forward<scalar_t, scalar_t, float>(
+          reinterpret_cast<scalar_t*>(softmax_results_ptr),
+	  reinterpret_cast<const scalar_t*>(input_ptr),
+	  reinterpret_cast<const uint8_t*>(mask_ptr),
+	  scale_factor,
+	  query_seq_len,
+	  key_seq_len,
+	  batches,
+	  attn_heads,
+	  pad_batches);
+      );
+  return softmax_results;
+}
+
+torch::Tensor bwd_cuda(
+    torch::Tensor const& output_grads_, 
+    torch::Tensor const& softmax_results_, 
+    float scale_factor)  {
+	
+  auto output_grads = output_grads_.contiguous();
+  auto softmax_results = softmax_results_.contiguous();
+
+  //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len]
+  const int batches = output_grads.size(0);
+  const int attn_heads = output_grads.size(1);
+  const int query_seq_len = output_grads.size(2);
+  const int key_seq_len = output_grads.size(3);
+
+  void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
+
+  //Softmax Grad
+  DISPATCH_HALF_AND_BFLOAT(
+      output_grads_.scalar_type(),
+      "dispatch_scaled_masked_softmax_backward",
+      dispatch_scaled_masked_softmax_backward<scalar_t, scalar_t, float>(
+          reinterpret_cast<scalar_t*>(output_grads_ptr), 
+	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
+	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
+	  scale_factor,
+	  query_seq_len,
+	  key_seq_len,
+	  batches,
+	  attn_heads);
+			   );
+  
+  //backward pass is completely in-place
+  return output_grads;
+}
+}
+}
+}
--- a/csrc/megatron/scaled_upper_triang_masked_softmax.cpp
+++ b/csrc/megatron/scaled_upper_triang_masked_softmax.cpp
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda_fp16.h>
+#include <torch/extension.h>
+#include <vector>
+
+namespace multihead_attn {
+namespace fused_softmax {
+namespace scaled_upper_triang_masked_softmax {
+
+torch::Tensor fwd_cuda(
+    torch::Tensor const& input, 
+    float scale_factor);
+
+torch::Tensor bwd_cuda(
+    torch::Tensor const& output_grads, 
+    torch::Tensor const& softmax_results,
+    float scale_factor);
+
+torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
+  AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
+	     (input.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");
+
+  return fwd_cuda(input, scale_factor);
+}
+
+torch::Tensor bwd(
+    torch::Tensor const& output_grads, 
+    torch::Tensor const& softmax_results,
+    float scale_factor) {
+
+  AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
+
+  AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
+	     (output_grads.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");
+  AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
+	     (softmax_results.scalar_type() == at::ScalarType::BFloat16), 
+      "Only fp16 and bf16 are supported");
+
+  return bwd_cuda(output_grads, softmax_results, scale_factor);
+}
+
+} // end namespace scaled_upper_triang_masked_softmax
+} // end namespace fused_softmax
+} // end namespace multihead_attn
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", 
+        &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
+	"Self Multihead Attention scaled, time masked softmax -- Forward.");
+  m.def("backward", 
+        &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
+	"Self Multihead Attention scaled, time masked softmax -- Backward.");
+}
--- a/csrc/megatron/scaled_upper_triang_masked_softmax.h
+++ b/csrc/megatron/scaled_upper_triang_masked_softmax.h
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <assert.h>
+#include <cuda_fp16.h>
+#include <cfloat>
+#include <limits>
+#include <stdint.h>
+#include <c10/macros/Macros.h>
+
+namespace {
+
+template <typename Datatype, int ELEMENTS_PER_LDG>
+__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src);
+
+template <>
+__device__ __inline__ void copy_vector<c10::BFloat16, 1>(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; }
+
+template <>
+__device__ __inline__ void copy_vector<c10::BFloat16, 4>(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); }
+  
+template <>
+__device__ __inline__ void copy_vector<c10::Half, 1>(c10::Half *dst, const c10::Half *src) { *dst = *src; }
+
+template <>
+__device__ __inline__ void copy_vector<c10::Half, 4>(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); }
+
+template <>
+__device__ __inline__ void copy_vector<uint8_t, 1>(uint8_t *dst, const uint8_t *src) { *dst = *src; }
+
+template <>
+__device__ __inline__ void copy_vector<uint8_t, 4>(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); }
+
+template <typename Datatype, int ELEMENTS_PER_LDG>
+__device__ __inline__ void copy_zero_vector(Datatype *dst);
+
+template <>
+__device__ __inline__ void copy_zero_vector<c10::BFloat16, 1>(c10::BFloat16 *dst) { *dst = 0.0; }
+
+template <>
+__device__ __inline__ void copy_zero_vector<c10::BFloat16, 4>(c10::BFloat16 *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); }
+
+template <>
+__device__ __inline__ void copy_zero_vector<c10::Half, 1>(c10::Half *dst) { *dst = 0.0; }
+
+template <>
+__device__ __inline__ void copy_zero_vector<c10::Half, 4>(c10::Half *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); }
+
+
+int log2_ceil(int value) {
+    int log2_value = 0;
+    while ((1 << log2_value) < value) ++log2_value;
+    return log2_value;
+}
+
+template<typename T>
+struct Add {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a + b;
+  }
+};
+
+template<typename T>
+struct Max {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a < b ? b : a;
+  }
+};
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+    return __shfl_xor_sync(mask, value, laneMask, width);
+#else
+    return __shfl_xor(value, laneMask, width);
+#endif
+}
+
+template <typename acc_t, int WARP_BATCH, int WARP_SIZE, template<typename> class ReduceOp>
+__device__ __forceinline__ void warp_reduce(acc_t* sum) {
+    ReduceOp<acc_t> r;
+    #pragma unroll
+    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+        #pragma unroll
+        for (int i = 0;  i < WARP_BATCH;  ++i) {
+            acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE);
+            sum[i] = r(sum[i], b);
+        }
+    }
+}
+
+/*
+ * Extended softmax (from native aten pytorch) with following additional features
+ * 1) input scaling
+ * 2) Implicit time (diagonal masking)
+ */
+template <typename input_t, typename output_t, typename acc_t, int log2_elements>
+__global__ void scaled_upper_triang_masked_softmax_warp_forward(
+    output_t *dst, 
+    const input_t *src, 
+    const acc_t scale, 
+    int micro_batch_size, 
+    int stride, 
+    int element_count) 
+{
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and 
+    // warp_size of method warp_softmax_forward_kernel.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
+    constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
+
+    int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x;
+    int local_seq = blockIdx.x + 1; 
+    int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE;
+
+    // micro_batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to computed within this WARP.
+    int local_batches = micro_batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x;
+
+    src += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
+    dst += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
+
+    // load data from global memory
+    acc_t elements[WARP_BATCH][WARP_ITERATIONS];
+    input_t temp_data[ELEMENTS_PER_LDG_STG];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : local_seq;
+
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
+            int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
+
+            if (element_index < batch_element_count) {
+                copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_data, src + i*element_count*stride + it*WARP_SIZE);
+
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    if ((element_index + element) < batch_element_count) {
+                        elements[i][it+element] = (acc_t)temp_data[element] * scale;
+                    } else {
+                        elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
+                    }
+                }
+            } else {
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
+                }
+            }
+        }
+    }
+
+    // compute max_value
+    acc_t max_value[WARP_BATCH];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        max_value[i] = elements[i][0];
+        #pragma unroll
+        for (int it = 1;  it < WARP_ITERATIONS;  ++it) {
+            max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);
+
+    acc_t sum[WARP_BATCH] { 0.0f };
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (it < warp_iteration_limit) {
+                elements[i][it] = std::exp((elements[i][it] - max_value[i]));
+                sum[i] += elements[i][it];
+            } 
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
+
+    // store result
+    output_t out[ELEMENTS_PER_LDG_STG];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
+            int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
+
+            if (element_index < local_seq) {
+
+                #pragma unroll  
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    if (element_index + element < local_seq) {
+                        out[element] = elements[i][it + element] / sum[i];
+                    } else {
+                        out[element] = 0;
+                    }
+                }
+                copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count * stride + it * WARP_SIZE, out);
+            } else if (element_index < element_count) {
+                copy_zero_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count * stride + it * WARP_SIZE);
+            } else {
+                break;
+            } 
+        }
+    }
+}
+
+template <typename input_t, typename output_t, typename acc_t, int log2_elements>
+__global__ void scaled_upper_triang_masked_softmax_warp_backward(
+    output_t *gradInput, 
+    input_t *grad, 
+    const input_t *output,
+    acc_t scale, 
+    int micro_batch_size, 
+    int stride, 
+    int element_count)
+{
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and 
+    // warp_size of method warp_softmax_backward_kernel.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
+    constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
+
+    int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x;
+    int local_seq = blockIdx.x + 1; 
+    
+    // micro_batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to computed within this WARP.
+    int local_batches = micro_batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x;
+
+    // the first element to process by the current thread
+    int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx;
+    grad += thread_offset;
+    output += thread_offset;
+    gradInput += thread_offset;
+
+    // load data from global memory
+    acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f };
+    acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f };
+    input_t temp_grad[ELEMENTS_PER_LDG_STG];
+    input_t temp_output[ELEMENTS_PER_LDG_STG];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : local_seq;
+
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
+            int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
+            if (element_index < batch_element_count) {
+                copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_grad, grad + i * element_count * stride + it * WARP_SIZE);
+                copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_output, output + i * element_count * stride + it * WARP_SIZE);
+
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    if (element_index + element < batch_element_count) {
+                        output_reg[i][it + element] = (acc_t)temp_output[element];
+                    }
+                }
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    if (element_index + element < batch_element_count) {
+                        grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element];
+                    }
+                }
+            }
+        }
+    }
+   
+    acc_t sum[WARP_BATCH];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        sum[i] = grad_reg[i][0];
+        #pragma unroll
+        for (int it = 1;  it < WARP_ITERATIONS;  ++it) {
+            sum[i] += grad_reg[i][it];
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
+
+    // store result
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
+            int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                // compute gradients
+                output_t out[ELEMENTS_PER_LDG_STG];
+                #pragma unroll
+                for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
+                    out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i]));
+                }
+                copy_vector<output_t, ELEMENTS_PER_LDG_STG>(gradInput + i * element_count * stride + it * WARP_SIZE, out);
+            } 
+        }
+    }
+}
+
+} // end of anonymous namespace
+
+template<typename input_t, typename output_t, typename acc_t>
+void dispatch_scaled_upper_triang_masked_softmax_forward(
+    output_t *dst, 
+    const input_t *src, 
+    const input_t scale, 
+    int softmax_elements, 
+    int softmax_elements_stride, 
+    int attn_batches)
+{
+    TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048 );
+    if (softmax_elements == 0) {
+        return;
+    } else {
+        int log2_elements = log2_ceil(softmax_elements);
+        const int next_power_of_two = 1 << log2_elements;
+        int seq_len = softmax_elements;
+        int batch_count = attn_batches * seq_len;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
+        int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximimize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0);
+
+        int blocks_per_seq = attn_batches / batches_per_block;
+        dim3 blocks(seq_len, blocks_per_seq, 1);
+        dim3 threads(warp_size, warps_per_block, 1);
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            case 0: // 1
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 0>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 1: // 2
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 1>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 2: // 4
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 2>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 3: // 8
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 3>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 4: // 16
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 4>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 5: // 32
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 5>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 6: // 64
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 6>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 7: // 128
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 7>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 8: // 256
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 8>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 9: // 512
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 9>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 10: // 1024
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 10>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 11: // 2048
+                scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 11>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            default:
+                break;
+        }
+    }
+}
+
+template<typename input_t, typename output_t, typename acc_t>
+void dispatch_scaled_upper_triang_masked_softmax_backward(
+    output_t *grad_input, 
+    input_t *grad, 
+    const input_t *output, 
+    const acc_t scale, 
+    int softmax_elements, 
+    int softmax_elements_stride, 
+    int attn_batches)
+{
+    TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 );
+    if (softmax_elements == 0) {
+       return;
+    } else {
+        int log2_elements = log2_ceil(softmax_elements);
+        const int next_power_of_two = 1 << log2_elements;
+        int seq_len = softmax_elements;
+        int batch_count = attn_batches * seq_len;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
+        int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximimize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0);
+
+        int blocks_per_seq = attn_batches / batches_per_block;
+        dim3 blocks(seq_len, blocks_per_seq, 1);
+        dim3 threads(warp_size, warps_per_block, 1);
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            case 0: // 1
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 0>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 1: // 2
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 1>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 2: // 4
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 2>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 3: // 8
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 3>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 4: // 16
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 4>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 5: // 32
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 5>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 6: // 64
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 6>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 7: // 128
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 7>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 8: // 256
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 8>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 9: // 512
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 9>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 10: // 1024
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 10>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            case 11: // 2048
+                scaled_upper_triang_masked_softmax_warp_backward<input_t, output_t, acc_t, 11>
+                    <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements);
+                break;
+            default:
+                break;
+        }
+    }
+}
--- a/csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu
+++ b/csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_profiler_api.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include "scaled_upper_triang_masked_softmax.h"
+#include "type_shim.h"
+
+namespace multihead_attn {
+namespace fused_softmax {
+namespace scaled_upper_triang_masked_softmax {
+
+torch::Tensor fwd_cuda(
+    torch::Tensor const& input, 
+    float scale_factor)
+{
+  // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
+  const int attn_batches = input.size(0);
+  const int seq_len = input.size(1);
+  TORCH_INTERNAL_ASSERT(seq_len <= 2048);
+
+  // Output 
+  auto act_options = input.options().requires_grad(false);
+  torch::Tensor softmax_results = 
+      torch::empty({attn_batches, seq_len, seq_len}, act_options);
+
+  // Softmax Intermediate Result Ptr
+  void* input_ptr = static_cast<void*>(input.data_ptr());
+  void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
+
+  DISPATCH_HALF_AND_BFLOAT(
+      input.scalar_type(),
+      "dispatch_scaled_upper_triang_masked_softmax_forward",
+      dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
+	  reinterpret_cast<scalar_t*>(softmax_results_ptr),
+	  reinterpret_cast<const scalar_t*>(input_ptr),
+	  scale_factor,
+	  seq_len,
+	  seq_len,
+	  attn_batches);
+      );
+  return softmax_results;
+}
+				      
+
+torch::Tensor bwd_cuda(
+    torch::Tensor const& output_grads_, 
+    torch::Tensor const& softmax_results_, 
+    float scale_factor)  {
+	
+  auto output_grads = output_grads_.contiguous();
+  auto softmax_results = softmax_results_.contiguous();
+
+  //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
+  const int attn_batches = output_grads.size(0);
+  const int seq_len = output_grads.size(1);
+  TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
+
+  void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
+
+  //Softmax Grad
+  DISPATCH_HALF_AND_BFLOAT(
+      output_grads_.scalar_type(),
+      "dispatch_scaled_upper_triang_masked_softmax_backward",
+      dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
+          reinterpret_cast<scalar_t*>(output_grads_ptr), 
+	  reinterpret_cast<scalar_t*>(output_grads_ptr), 
+	  reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
+	  scale_factor,
+	  seq_len,
+	  seq_len,
+	  attn_batches);
+      );
+  
+  //backward pass is completely in-place
+  return output_grads;
+}
+}
+}
+}
--- a/csrc/type_shim.h
+++ b/csrc/type_shim.h
@@ -34,6 +34,32 @@
  }


+#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...) \
+  switch(TYPE) \
+  { \
+    case at::ScalarType::Float: \
+    { \
+      using scalar_t_##LEVEL = float; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Half: \
+    { \
+      using scalar_t_##LEVEL = at::Half; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::BFloat16: \
+    { \
+      using scalar_t_##LEVEL = at::BFloat16; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    default: \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");  \
+  }
+
+
 #define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...) \
  switch(TYPE) \
  { \
@@ -106,6 +132,160 @@
  }


+  #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)			\
+  switch(TYPE)								\
+    {									\
+    case at::ScalarType::Half:						\
+      {									\
+	using scalar_t = at::Half;					\
+	__VA_ARGS__;							\
+	break;								\
+      }									\
+    case at::ScalarType::BFloat16:					\
+      {									\
+	using scalar_t = at::BFloat16;					\
+	__VA_ARGS__;							\
+	break;								\
+      }									\
+    default:								\
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");	\
+  }
+
+
+  #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
+  switch(TYPEIN)							\
+    {									\
+    case at::ScalarType::Float:						\
+      {									\
+	using scalar_t_in = float;					\
+	switch(TYPEOUT)							\
+	  {								\
+	  case at::ScalarType::Float:					\
+	    {								\
+	      using scalar_t_out = float;				\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  case at::ScalarType::Half:					\
+	    {								\
+	      using scalar_t_out = at::Half;				\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  case at::ScalarType::BFloat16:				\
+	    {								\
+	      using scalar_t_out = at::BFloat16;			\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  default:							\
+	    AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
+	  }								\
+	break;								\
+      }									\
+    case at::ScalarType::Half:						\
+      {									\
+	using scalar_t_in = at::Half;					\
+	using scalar_t_out = at::Half;					\
+	__VA_ARGS__;							\
+	break;								\
+      }									\
+    case at::ScalarType::BFloat16:					\
+      {									\
+	using scalar_t_in = at::BFloat16;				\
+	using scalar_t_out = at::BFloat16;				\
+	__VA_ARGS__;							\
+	break;								\
+      }									\
+    default:								\
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");	\
+    }
+
+
+  #define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
+  switch(TYPEIN)							\
+    {									\
+    case at::ScalarType::Double:						\
+      {									\
+	using scalar_t_in = double;					\
+	switch(TYPEOUT)							\
+	  {								\
+	  case at::ScalarType::Double:					\
+	    {								\
+	      using scalar_t_out = double;				\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  case at::ScalarType::Float:					\
+	    {								\
+	      using scalar_t_out = float;				\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  case at::ScalarType::Half:					\
+	    {								\
+	      using scalar_t_out = at::Half;				\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  case at::ScalarType::BFloat16:				\
+	    {								\
+	      using scalar_t_out = at::BFloat16;			\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  default:							\
+	    AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
+	  }								\
+	break;								\
+      }									\
+    case at::ScalarType::Float:						\
+      {									\
+	using scalar_t_in = float;					\
+	switch(TYPEOUT)							\
+	  {								\
+	  case at::ScalarType::Float:					\
+	    {								\
+	      using scalar_t_out = float;				\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  case at::ScalarType::Half:					\
+	    {								\
+	      using scalar_t_out = at::Half;				\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  case at::ScalarType::BFloat16:				\
+	    {								\
+	      using scalar_t_out = at::BFloat16;			\
+	      __VA_ARGS__;						\
+	      break;							\
+	    }								\
+	  default:							\
+	    AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
+	  }								\
+	break;								\
+      }									\
+    case at::ScalarType::Half:						\
+      {									\
+	using scalar_t_in = at::Half;					\
+	using scalar_t_out = at::Half;					\
+	__VA_ARGS__;							\
+	break;								\
+      }									\
+    case at::ScalarType::BFloat16:					\
+      {									\
+	using scalar_t_in = at::BFloat16;				\
+	using scalar_t_out = at::BFloat16;				\
+	__VA_ARGS__;							\
+	break;								\
+      }									\
+    default:								\
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");	\
+    }
+
+
 template<typename T>
 __device__ __forceinline__ T reduce_block_into_lanes
  (T *x,

--- a/setup.py
+++ b/setup.py
@@ -206,6 +206,30 @@ if "--cuda_ext" in sys.argv:
                          extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                              'nvcc':['-O3'] + version_dependent_macros}))

+        ext_modules.append(
+            CUDAExtension(name='scaled_upper_triang_masked_softmax_cuda',
+                          sources=['csrc/megatron/scaled_upper_triang_masked_softmax.cpp',
+                                   'csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu'],
+                          include_dirs=[os.path.join(this_dir, 'csrc')],
+                          extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                              'nvcc':['-O3',
+                                                      '-U__CUDA_NO_HALF_OPERATORS__',
+                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
+                                                      '--expt-relaxed-constexpr',
+                                                      '--expt-extended-lambda'] + version_dependent_macros}))
+
+        ext_modules.append(
+            CUDAExtension(name='scaled_masked_softmax_cuda',
+                          sources=['csrc/megatron/scaled_masked_softmax.cpp',
+                                   'csrc/megatron/scaled_masked_softmax_cuda.cu'],
+                          include_dirs=[os.path.join(this_dir, 'csrc')],
+                          extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                              'nvcc':['-O3',
+                                                      '-U__CUDA_NO_HALF_OPERATORS__',
+                                                      '-U__CUDA_NO_HALF_CONVERSIONS__',
+                                                      '--expt-relaxed-constexpr',
+                                                      '--expt-extended-lambda'] + version_dependent_macros}))
+
 if "--bnp" in sys.argv:
    sys.argv.remove("--bnp")

@@ -495,6 +519,7 @@ if "--fast_bottleneck" in sys.argv:
                          include_dirs=[os.path.join(this_dir, 'apex/contrib/csrc/cudnn-frontend/include')],
                          extra_compile_args={'cxx': ['-O3',] + version_dependent_macros + generator_flag}))

+
 setup(
    name='apex',
    version='0.1',

--- a/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
+++ b/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
+import itertools
 import unittest
-import os
-import random

 import torch
+
 import apex
-from torch.autograd import Variable

-        
+
 class TestFusedLayerNorm(unittest.TestCase):
+    dtype = torch.float
+    elementwise_affine = False
+    normalized_shape = [32, 16]
+    rtol, atol = None, None
+    fwd_thresholds = dict(rtol=None, atol=None)
+    bwd_thresholds = dict(rtol=None, atol=None)
+
    def setUp(self):
        # bias and weight are set to 0 and 1 respectively, so no need to copy parameters from cpu module to the gpu one
-        self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cpu()
-        self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cuda()
+        self.module_cpu_ = apex.normalization.FusedLayerNorm(
+            normalized_shape=self.normalized_shape, elementwise_affine=self.elementwise_affine).cpu()
+        self.module_cuda_ = apex.normalization.FusedLayerNorm(
+            normalized_shape=self.normalized_shape, elementwise_affine=self.elementwise_affine).to(device="cuda", dtype=self.dtype)

-    def _test_same_output(self, batch_size):
+    def _check_same_output(self, batch_size, contiguous):
        torch.cuda.manual_seed(42)
-        self.input_ = torch.randn((batch_size, *self.module_cpu_.normalized_shape), device="cpu").requires_grad_(True)
-        self.input_cuda_ = self.input_.cuda().detach().requires_grad_(True)
-        out_cpu_ = self.module_cpu_(self.input_)
+        if contiguous:
+            input_shape = [batch_size] + self.normalized_shape
+            input_ = torch.randn(input_shape, device="cpu").requires_grad_(True)
+            input_cuda_ = input_.to(device="cuda", dtype=self.dtype).detach().requires_grad_(True)
+            self.assertTrue(input_.is_contiguous())
+            self.assertTrue(input_cuda_.is_contiguous())
+        else:
+            input_shape = [batch_size] + self.normalized_shape
+            input_shape = [batch_size * 3] + [self.normalized_shape[0] * 5, self.normalized_shape[1] * 3]
+            input_src_ = torch.randn(input_shape, device="cpu")
+            input_ = input_src_[::3, ::5, ::3].detach().requires_grad_(True)
+            input_cuda_ = input_src_.to(device="cuda", dtype=self.dtype)[::3, ::5, ::3].detach().requires_grad_(True)
+            # make sure that tensors are NOT contiguous.
+            self.assertFalse(input_.is_contiguous())
+            self.assertFalse(input_cuda_.is_contiguous())
+        out_cpu_ = self.module_cpu_(input_)
        gO = torch.rand_like(out_cpu_)
        out_cpu_.backward(gO)
-        out_cuda_ = self.module_cuda_(self.input_cuda_)
-        gO = gO.cuda()
+        out_cuda_ = self.module_cuda_(input_cuda_)
+        gO = gO.to(device="cuda", dtype=self.dtype)
        out_cuda_.backward(gO)
-        assert out_cpu_.is_cuda == False
-        assert out_cuda_.is_cuda == True
-        torch.testing.assert_allclose(out_cpu_, out_cuda_.cpu())
-        torch.testing.assert_allclose(self.input_.grad, self.input_cuda_.grad.cpu())
+        self.assertFalse(out_cpu_.is_cuda)
+        self.assertTrue(out_cuda_.is_cuda)
+        # TODO (mkozuki): `torch.testing.assert_allclose` is deprecated.
+        # Use `torch.testing.assert_close`.
+        # See https://github.com/pytorch/pytorch/issues/61844
+        torch.testing.assert_allclose(
+            out_cpu_.to(device="cuda", dtype=self.dtype), out_cuda_, **self.fwd_thresholds)
+        torch.testing.assert_allclose(
+            input_.grad.to(device="cuda", dtype=self.dtype), input_cuda_.grad, **self.bwd_thresholds)
+
+    def _test_same_output(self, batch_size):
+        for contiguous in (True, False):
+            with self.subTest(contiguous=contiguous):
+                self._check_same_output(batch_size, contiguous)

    def test_layer_norm(self):
        self._test_same_output(16)

    def test_large_batch(self):
        self._test_same_output(65536)
-        
-        
+
+
 class TestFusedLayerNormElemWise(TestFusedLayerNorm):
+    elementwise_affine = True
+
+
+class TestFusedLayerNormElemWiseHalf(TestFusedLayerNormElemWise):
+    dtype = torch.half
+
+    def test_large_batch(self):
+        self.skipTest("Skip to save time")
+
+
+# Megatron style Layer Norm
+class TestFusedLayerNormElemWiseMixedDtypes(TestFusedLayerNorm):
    def setUp(self):
-        self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cpu()
-        self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cuda()
+        self.module_cpu_ = apex.normalization.MixedFusedLayerNorm(
+            normalized_shape=self.normalized_shape, elementwise_affine=True).cpu()
+        self.module_cuda_ = apex.normalization.MixedFusedLayerNorm(
+            normalized_shape=self.normalized_shape, elementwise_affine=True).to(device="cuda", dtype=self.dtype)
+
+    def test_init_exception(self):
+        with self.assertRaisesRegex(RuntimeError, "MixedFusedLayerNorm does not support `elementwise_affine = False`"):
+            apex.normalization.MixedFusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cuda()
+
+
+class TestFusedLayerNormElemWiseMixedDtypesHalf(TestFusedLayerNormElemWiseMixedDtypes):
+    dtype = torch.half
+
+    def test_large_batch(self):
+        self.skipTest("Skip to save time")
+
+
+# NOTE (mkozuki): With the larger threshold values, still flaky.
+class TestFusedLayerNormElemWiseMixedDtypesBFloat16(TestFusedLayerNormElemWiseMixedDtypesHalf):
+    dtype = torch.bfloat16
+    # NOTE (mkozuki): [BFloat16 Layer Norm flakiness]
+    # Use thresholds larger than those used in pytorch, see
+    # https://github.com/pytorch/pytorch/blob/72274e2a2fd55019ec860e1743dbdc5b0c5a5624/torch/testing/_asserts.py#L26
+    fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
+    bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
+
+
+class TestFusedLayerNormElemWiseBFloat16(TestFusedLayerNormElemWise):
+    dtype = torch.bfloat16
+    # See [BFloat16 Layer Norm flakiness]
+    fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
+    bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
+
+    def test_large_batch(self):
+        self.skipTest("Skip to save time")
+
+
+def _prep_layers(normalized_shape, elementwise_affine, dtype):
+    native = torch.nn.LayerNorm(
+        normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
+    ).to(device="cuda", dtype=dtype)
+    fused = apex.normalization.FusedLayerNorm(
+        normalized_shape=normalized_shape, elementwise_affine=elementwise_affine
+    ).cuda()
+    return native, fused
+
+
+def _prep_inputs(batch_size, normalized_shape, dtype):
+    shape = (batch_size, *normalized_shape)
+    fused = torch.randn(shape).cuda().requires_grad_(True)
+    with torch.no_grad():
+        native = fused.clone().to(dtype).requires_grad_(True)
+    return native, fused
+
+
+autocast_dtypes = (torch.half, torch.bfloat16) if torch.cuda.is_bf16_supported() else (torch.half,)
+
+
+class TestAutocastFusedLayerNorm(unittest.TestCase):
+    bf16_fwd_thresholds = dict(rtol=1.6e-2, atol=3e-4)
+    bf16_bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
+
+    def setUp(self):
+        self.batch_size = 16
+        self.normalized_shape = [32, 16]
+
+    def _run_test(self, dtype, elementwise_affine):
+        native, fused = _prep_layers(self.normalized_shape, elementwise_affine, dtype)
+        native_x, fused_x = _prep_inputs(self.batch_size, self.normalized_shape, dtype)
+
+        expected = native(native_x)
+        with torch.cuda.amp.autocast(dtype=dtype):
+            actual = fused(fused_x)
+        tols = {'rtol': None, 'atol': None} if dtype == torch.half else TestAutocastFusedLayerNorm.bf16_fwd_thresholds
+        torch.testing.assert_allclose(actual, expected, **tols)
+
+        g_native = torch.rand_like(expected)
+        with torch.no_grad():
+            g_fused = g_native.clone()
+        expected.backward(g_native)
+        actual.backward(g_fused)
+
+        tols = {'rtol': None, 'atol': None} if dtype == torch.half else TestAutocastFusedLayerNorm.bf16_bwd_thresholds
+        torch.testing.assert_allclose(native_x.grad, fused_x.grad, **tols)

+    def test_autocast(self):
+        for (dtype, elementwise_affine) in itertools.product(autocast_dtypes, (True, False)):
+            with self.subTest(f"{dtype}-{elementwise_affine}"):
+                self._run_test(dtype, elementwise_affine)
--- a/tests/L0/run_test.py
+++ b/tests/L0/run_test.py
+"""L0 Tests Runner.
+
+How to run this script?
+
+1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py`
+2. Run one of the tests (e.g. fused layer norm):
+    `python /path/to/apex/tests/L0/run_test.py --include run_fused_layer_norm`
+3. Run two or more of the tests (e.g. optimizers and fused layer norm):
+    `python /path/to/apex/tests/L0/run_test.py --include run_optimizers run_fused_layer_norm`
+"""
+import argparse
+import os
 import unittest
 import sys

-test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"]

-runner = unittest.TextTestRunner(verbosity=2)
+TEST_ROOT = os.path.dirname(os.path.abspath(__file__))
+TEST_DIRS = [
+    "run_amp",
+    "run_fp16util",
+    "run_optimizers",
+    "run_fused_layer_norm",
+    "run_pyprof_nvtx",
+    "run_pyprof_data",
+    "run_mlp",
+    "run_transformer",
+]
+DEFAULT_TEST_DIRS = [
+    "run_optimizers",
+    "run_fused_layer_norm",
+    "run_mlp",
+    "run_transformer",
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="L0 test runner",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--include",
+        nargs="+",
+        choices=TEST_DIRS,
+        default=DEFAULT_TEST_DIRS,
+        help="select a set of tests to run (defaults to ALL tests).",
+    )
+    args, _ = parser.parse_known_args()
+    return args
+
+
+def main(args):
+    runner = unittest.TextTestRunner(verbosity=2)
+    errcode = 0
+    for test_dir in args.include:
+        test_dir = os.path.join(TEST_ROOT, test_dir)
+        print(test_dir)
+        suite = unittest.TestLoader().discover(test_dir)

-errcode = 0
+        print("\nExecuting tests from " + test_dir)

-for test_dir in test_dirs:
-    suite = unittest.TestLoader().discover(test_dir)
+        result = runner.run(suite)

-    print("\nExecuting tests from " + test_dir)
+        if not result.wasSuccessful():
+            errcode = 1

-    result = runner.run(suite)
+    sys.exit(errcode)

-    if not result.wasSuccessful():
-        errcode = 1

-sys.exit(errcode)
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
--- a/tests/L0/run_transformer/__init__.py
+++ b/tests/L0/run_transformer/__init__.py
--- a/tests/L0/run_transformer/run_cross_entropy_test.py
+++ b/tests/L0/run_transformer/run_cross_entropy_test.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn.functional as F
+
+from apex.transformer.tensor_parallel.tests.commons import set_random_seed
+from apex.transformer.tensor_parallel.tests.commons import IdentityLayer
+from apex.transformer.tensor_parallel.tests.commons import print_separator
+from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
+from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
+from apex.transformer import parallel_state
+from apex.transformer import tensor_parallel
+from apex.transformer.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
+from apex.transformer.tensor_parallel.tests import global_vars
+
+
+global_vars.set_global_variables()
+
+
+def torch_cross_entropy(batch_size, seq_length, vocab_size,
+                        logits_scale, seed):
+    set_random_seed(seed)
+    identity = IdentityLayer((batch_size, seq_length, vocab_size),
+                             scale=logits_scale).cuda()
+    logits = identity()
+    target = torch.cuda.LongTensor(
+        size=(batch_size, seq_length)).random_(0, vocab_size)
+    loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
+                           target.view(-1),
+                           reduction='none').view_as(target).mean()
+    loss.backward()
+    return loss, identity.weight.grad
+
+
+def tensor_sharded_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed):
+    set_random_seed(seed)
+    identity = IdentityLayer((batch_size, seq_length, vocab_size), scale=logits_scale).cuda()
+    logits = identity()
+    logits_parallel = tensor_parallel.scatter_to_tensor_model_parallel_region(logits)
+    target = torch.cuda.LongTensor(
+        size=(batch_size, seq_length)).random_(0, vocab_size)
+    loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
+    loss.backward()
+    return loss, identity.weight.grad
+
+
+def test_cross_entropy(tensor_model_parallel_size):
+
+    if torch.distributed.get_rank() == 0:
+        print('> testing cross entropy with model parallel size {} ...'.
+              format(tensor_model_parallel_size))
+
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size)
+    tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
+
+    batch_size = 13
+    seq_length = 17
+    vocab_size_per_partition = 11
+    logits_scale = 1000.0
+    vocab_size = vocab_size_per_partition * tensor_model_parallel_size
+    seed = 1234
+
+    loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed)
+    loss_mpu, grad_mpu = tensor_sharded_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed)
+
+    error = loss_torch.sub_(loss_mpu).abs().max()
+    print('   max error in loss on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+
+    error = grad_torch.sub_(grad_mpu).abs().max()
+    print('   max error in grad on global rank {}: {}'.format(
+        torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+
+    # Reset groups
+    parallel_state.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print(TEST_SUCCESS_MESSAGE)
+
+
+if __name__ == '__main__':
+
+    initialize_distributed()
+    world_size = torch.distributed.get_world_size()
+
+    tensor_model_parallel_size = 1
+    while tensor_model_parallel_size <= world_size:
+        print_separator('test cross entropy')
+        test_cross_entropy(tensor_model_parallel_size)
+        tensor_model_parallel_size *= 2
--- a/tests/L0/run_transformer/run_data_test.py
+++ b/tests/L0/run_transformer/run_data_test.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import operator
+
+import torch
+
+from apex.transformer import parallel_state
+from apex.transformer.tensor_parallel import data as data_utils
+from apex.transformer.tensor_parallel.tests import global_vars
+from apex.transformer.tensor_parallel.tests.commons import print_separator
+from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
+from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
+
+global_vars.set_global_variables()
+
+
+def test_broadcast_data(tensor_model_parallel_size):
+
+    if torch.distributed.get_rank() == 0:
+        print('> testing broadcast_data with model parallel size {} ...'.
+              format(tensor_model_parallel_size))
+
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size)
+    torch.manual_seed(1234 + parallel_state.get_data_parallel_rank())
+    tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
+
+    key_size_t = {
+        'key1': [7, 11],
+        'key2': [8, 2, 1],
+        'key3': [13],
+        'key4': [5, 1, 2],
+        'key5': [5, 12],
+    }
+    keys = list(key_size_t.keys())
+
+    data = {}
+    data_t = {}
+    for key in key_size_t:
+        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
+        data_t[key] = data[key].clone()
+    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
+    data_t['keyX'] = data['keyX'].clone()
+    if parallel_state.get_tensor_model_parallel_rank() != 0:
+        data = None
+
+    data_utils._check_data_types(keys, data_t, torch.int64)
+    key_size, key_numel, \
+        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
+    for key in keys:
+        assert key_size[key] == key_size_t[key]
+    total_numel_t = 0
+    for key in keys:
+        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
+        assert key_numel[key] == target_size
+        total_numel_t += target_size
+    assert total_numel == total_numel_t
+
+    data_b = data_utils.broadcast_data(keys, data, torch.int64)
+    for key in keys:
+        tensor = data_t[key].cuda()
+        assert data_b[key].sub(tensor).abs().max() == 0
+
+    # Reset groups
+    parallel_state.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print(TEST_SUCCESS_MESSAGE)
+
+
+if __name__ == '__main__':
+
+    initialize_distributed()
+    world_size = torch.distributed.get_world_size()
+
+    tensor_model_parallel_size = 1
+    while tensor_model_parallel_size <= world_size:
+        print_separator('test test broadcast data')
+        test_broadcast_data(tensor_model_parallel_size)
+        tensor_model_parallel_size *= 2
--- a/tests/L0/run_transformer/run_initialize_test.py
+++ b/tests/L0/run_transformer/run_initialize_test.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+
+from apex.transformer import parallel_state
+from apex.transformer.tensor_parallel.tests import global_vars
+from apex.transformer.tensor_parallel.tests.commons import print_separator
+from apex.transformer.tensor_parallel.tests.commons import initialize_distributed
+from apex.transformer.tensor_parallel.tests.commons import TEST_SUCCESS_MESSAGE
+
+
+global_vars.set_global_variables()
+
+
+def test_initialize_model_parallel(tensor_model_parallel_size):
+
+    if torch.distributed.get_rank() == 0:
+        print('> testing initialize_model_parallel with size {} ...'.format(
+            tensor_model_parallel_size))
+    tensor_model_parallel_size_ = min(
+        tensor_model_parallel_size,
+        torch.distributed.get_world_size(),
+    )
+    assert not parallel_state.model_parallel_is_initialized()
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size_)
+    assert parallel_state.model_parallel_is_initialized()
+
+    # Checks.
+    def check(group, world_size, rank):
+        assert world_size == torch.distributed.get_world_size(group=group)
+        assert rank == torch.distributed.get_rank(group=group)
+
+    # Model parallel.
+    world_size = tensor_model_parallel_size_
+    rank = torch.distributed.get_rank() % tensor_model_parallel_size_
+    assert world_size == parallel_state.get_tensor_model_parallel_world_size()
+    assert rank == parallel_state.get_tensor_model_parallel_rank()
+    check(parallel_state.get_tensor_model_parallel_group(), world_size, rank)
+
+    # Data parallel.
+    world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
+    rank = torch.distributed.get_rank() // tensor_model_parallel_size
+    assert world_size == parallel_state.get_data_parallel_world_size()
+    assert rank == parallel_state.get_data_parallel_rank()
+    check(parallel_state.get_data_parallel_group(), world_size, rank)
+
+    # Reset groups
+    parallel_state.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print(TEST_SUCCESS_MESSAGE)
+
+
+def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
+
+    if torch.distributed.get_rank() == 0:
+        print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
+            tensor_model_parallel_size_))
+    tensor_model_parallel_size = min(
+        tensor_model_parallel_size_,
+        torch.distributed.get_world_size(),
+    )
+    assert not parallel_state.model_parallel_is_initialized()
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size)
+    assert parallel_state.model_parallel_is_initialized()
+
+    # Checks
+    src_rank = torch.distributed.get_rank() - parallel_state.get_tensor_model_parallel_rank()
+    assert parallel_state.get_tensor_model_parallel_src_rank() == src_rank
+
+    # Reset groups
+    parallel_state.destroy_model_parallel()
+
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print('>> passed the test :-)')
+
+
+if __name__ == '__main__':
+
+    initialize_distributed()
+    world_size = torch.distributed.get_world_size()
+    tensor_model_parallel_size = 1
+    while tensor_model_parallel_size <= world_size:
+        print_separator('test initialize model parallel')
+        test_initialize_model_parallel(tensor_model_parallel_size)
+        print_separator('test model parallel source rank')
+        test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
+        tensor_model_parallel_size *= 2