Commit b8428a7f authored by peng xu

Merge branch 'main' into beam_search

parents e5034150 3f4e71df
image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel
image: gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel
test:
script:
......
......@@ -29,12 +29,13 @@ The following applies to all files unless otherwise noted:
--
This repository also contains code from Hugging Face Inc., Google Research,
Facebook (from their Fairseq project), and Philip Popien. Files from these
organizations have notices at the top of each file. Below are licenses
used in those files, as indicated.
Facebook (from their Fairseq and Dino projects), Microsoft (from their
Swin-Transformer project), and Philip Popien. Files from these
organizations have notices at the top of each file. Below are
licenses used in those files, as indicated.
------------- LICENSE FOR huggingface and Google Research code --------------
------------- LICENSE FOR Facebook, huggingface and Google Research code --------------
Apache License
......@@ -263,3 +264,113 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
------------- LICENSE FOR Microsoft Swin Transformer code --------------
MIT License
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------- NVIDIA Source Code License for SegFormer -----------------
1. Definitions
“Licensor” means any person or entity that distributes its Work.
“Software” means the original work of authorship made available under this
License.
“Work” means the Software and any additions to or derivative works of the
Software that are made available under this License.
The terms “reproduce,” “reproduction,” “derivative works,” and
“distribution” have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative works
shall not include works that remain separable from, or merely link
(or bind by name) to the interfaces of, the Work.
Works, including the Software, are “made available” under this License by
including in or with the Work either (a) a copyright notice referencing
the applicability of this License to the Work, or (b) a copy of this License.
2. License Grant
2.1 Copyright Grant. Subject to the terms and conditions of this License,
each Licensor grants to you a perpetual, worldwide, non-exclusive,
royalty-free, copyright license to reproduce, prepare derivative works of,
publicly display, publicly perform, sublicense and distribute its Work
and any resulting derivative works in any form.
3. Limitations
3.1 Redistribution. You may reproduce or distribute the Work only if
(a) you do so under this License, (b) you include a complete copy of this
License with your distribution, and (c) you retain without modification any
copyright, patent, trademark, or attribution notices that are present
in the Work.
3.2 Derivative Works. You may specify that additional or different terms
apply to the use, reproduction, and distribution of your derivative works
of the Work (“Your Terms”) only if (a) Your Terms provide that the use
limitation in Section 3.3 applies to your derivative works, and (b) you
identify the specific derivative works that are subject to Your Terms.
Notwithstanding Your Terms, this License (including the redistribution
requirements in Section 3.1) will continue to apply to the Work itself.
3.3 Use Limitation. The Work and any derivative works thereof only may
be used or intended for use non-commercially. Notwithstanding the
foregoing, NVIDIA and its affiliates may use the Work and any derivative
works commercially. As used herein, “non-commercially” means for research
or evaluation purposes only.
3.4 Patent Claims. If you bring or threaten to bring a patent claim against
any Licensor (including any claim, cross-claim or counterclaim in a lawsuit)
to enforce any patents that you allege are infringed by any Work, then
your rights under this License from such Licensor (including the grant
in Section 2.1) will terminate immediately.
3.5 Trademarks. This License does not grant any rights to use any Licensor’s
or its affiliates’ names, logos, or trademarks, except as necessary to
reproduce the notices described in this License.
3.6 Termination. If you violate any term of this License, then your rights
under this License (including the grant in Section 2.1) will terminate
immediately.
4. Disclaimer of Warranty.
THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT.
YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
5. Limitation of Liability.
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN
ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf) and [2](https://arxiv.org/pdf/2104.04473.pdf)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel (tensor and pipeline), and multi-node pre-training of transformer based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision.
Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel ([tensor](https://arxiv.org/pdf/1909.08053.pdf), [sequence](https://arxiv.org/pdf/2205.05198), and [pipeline](https://arxiv.org/pdf/2104.04473.pdf)), and multi-node pre-training of transformer based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision.
Below are some of the projects where we have directly used Megatron:
* [BERT and GPT Studies Using Megatron](https://arxiv.org/pdf/1909.08053.pdf)
......@@ -8,19 +8,26 @@ Below are some of the projects where we have directly used Megatron:
* [Local Knowledge Powered Conversational Agents](https://arxiv.org/abs/2010.10150)
* [MEGATRON-CNTRL: Controllable Story Generation with External Knowledge Using Large-Scale Language Models](https://www.aclweb.org/anthology/2020.emnlp-main.226.pdf)
* [RACE Reading Comprehension Dataset Leaderboard](http://www.qizhexie.com/data/RACE_leaderboard.html)
* [Scaling Language Model Training to a Trillion Parameters Using Megatron](https://arxiv.org/pdf/2104.04473.pdf)
* [Training Question Answering Models From Synthetic Data](https://www.aclweb.org/anthology/2020.emnlp-main.468.pdf)
* [Few-shot Instruction Prompts for Pretrained Language Models to Detect Social Biases](https://arxiv.org/abs/2112.07868)
* [Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models](https://arxiv.org/abs/2202.04173)
* [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](https://arxiv.org/abs/2201.11990)
* [Multi-Stage Prompting for Knowledgeable Dialogue Generation](https://arxiv.org/abs/2203.08745)
Megatron is also used in [NeMo Megatron](https://developer.nvidia.com/nvidia-nemo#nemo-megatron), a framework to help enterprises overcome the challenges of building and training sophisticated natural language processing models with billions and trillions of parameters.
Our codebase is capable of efficiently training very large (hundreds of billions of parameters) language models with both model and data parallelism. To demonstrate how the code scales with multiple GPUs and model sizes, we consider GPT models from 1 billion all the way to 1 trillion parameters. All models use a vocabulary size of 51,200 and a sequence length of 2048. We vary hidden size, number of attention heads, and number of layers to arrive at a specific model size. As the model size increases, we also modestly increase the batch size. We leverage [NVIDIA's Selene supercomputer](https://www.top500.org/system/179842/) to perform scaling studies and use up to 3072 [A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for the largest model. Each cluster node has 8 NVIDIA 80GB A100 GPUs. The table below shows the model configurations along with the achieved FLOPs (both per GPU and aggregate over all GPUs). Note that these results are from benchmark runs and these models were not trained to convergence; however, the FLOPs are measured for end-to-end training, i.e., they include all operations, including data loading, optimization, and even logging.
Our codebase is capable of efficiently training very large (hundreds of billions of parameters) language models with both model and data parallelism. To demonstrate how the code scales with multiple GPUs and model sizes, we consider GPT models from 1 billion all the way to 1 trillion parameters. All models use a vocabulary size of 51,200 and a sequence length of 2048. We vary hidden size, number of attention heads, and number of layers to arrive at a specific model size. As the model size increases, we also modestly increase the batch size. We leverage [NVIDIA's Selene supercomputer](https://www.top500.org/system/179842/) to perform scaling studies and use up to 3072 [A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for the largest model. Each cluster node has 8 NVIDIA 80GB A100 GPUs. The graph below shows that we scale nearly linearly up to 1-trillion-parameter models running on 3072 GPUs. Note that these results are from benchmark runs and these models were not trained to convergence; however, the FLOPs are measured for end-to-end training, i.e., they include all operations, including data loading, optimization, and even logging.
Additionally, the model parallel size column reports the combined tensor and pipeline model parallelism degree. For degrees larger than 8, a tensor parallel size of 8 was typically used. For example, the 145B model reports a total model parallel size of 64, which means that setup used TP=8 and PP=8.
![Scaling Graph](images/Achieved_petaFLOPs.png)
![Cases](images/cases_april2021.png)
All the cases from 1 billion to 1 trillion parameters achieve more than 43% half precision utilization, which is high for an end-to-end application. We observe that the utilization initially remains roughly constant, but as the hidden size increases for larger models, utilization starts increasing and reaches 52% for the largest model. We also note that the achieved aggregate petaFLOPs across all GPUs increases almost linearly with the number of GPUs, demonstrating good weak scaling.
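As a back-of-the-envelope check on how layer count and hidden size translate into the model sizes above, the sketch below uses the standard approximation of roughly 12·h² parameters per transformer layer plus the embedding table. The function name and the example configuration are illustrative assumptions, not values taken from the table.
<pre>
# Rough parameter count for a GPT-style transformer: ~12*h^2 parameters per layer
# (attention ~4*h^2, MLP ~8*h^2) plus the V*h token embedding. Biases, layernorms,
# and position embeddings are ignored, so this is only an estimate.
def approx_gpt_params(num_layers, hidden_size, vocab_size=51200):
    return 12 * num_layers * hidden_size ** 2 + vocab_size * hidden_size

# A GPT-3-like configuration (96 layers, hidden size 12288) lands near 175B:
print(f"{approx_gpt_params(96, 12288) / 1e9:.1f}B")  # -> 174.6B
</pre>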
The following table shows both model (MFU) and hardware (HFU) FLOPs utilization for select configurations up to 1T parameters (see [our paper](https://arxiv.org/pdf/2205.05198) for a description of how these are calculated). As the model size increases, we achieve better GPU utilization, and for the one trillion parameter model we reach an MFU and HFU of 56.3% and 57.0%, respectively. Note that these numbers are also measured on benchmark runs and in this case are measured using a data parallel size of one. Data parallelism introduces some overhead due to the gradient all-reduce required between the data parallel groups. However, for large transformer models, this overhead is not large and can be almost entirely eliminated by overlapping the gradient all-reduce with backpropagation. A rough recipe for estimating MFU from training throughput is sketched after the table.
| Model Size | Model FLOPs Utilization | Hardware FLOPs Utilization |
| :---: | :---: | :---: |
| 22B | 41.5% | 43.7% |
| 175B | 51.4% | 52.8% |
| 530B | 56.0% | 57.0% |
| 1T | 56.3% | 57.0% |
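The sketch below estimates MFU from training throughput. It uses the common 6·N FLOPs-per-token approximation for the forward and backward passes of an N-parameter decoder-only model, so it ignores the attention-score term and slightly underestimates the exact calculation in the paper linked above. The A100 peak of 312 teraFLOP/s (dense FP16/BF16) and the example throughput are assumptions for illustration.
<pre>
# Rough model-FLOPs-utilization estimate: achieved model FLOP/s over peak FLOP/s.
# Model FLOPs per token ~= 6 * N for an N-parameter decoder-only model
# (forward + backward, attention-score FLOPs ignored).
A100_PEAK_FLOPS = 312e12  # dense FP16/BF16 tensor-core peak per GPU

def approx_mfu(num_params, tokens_per_second, num_gpus,
               peak_flops_per_gpu=A100_PEAK_FLOPS):
    achieved_flops = 6 * num_params * tokens_per_second
    return achieved_flops / (num_gpus * peak_flops_per_gpu)

# Example with a made-up throughput of 150k tokens/s for a 175B model on 1024 GPUs:
print(f"{approx_mfu(175e9, 1.5e5, 1024):.1%}")  # -> ~49.3%
</pre>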
# Contents
* [Contents](#contents)
......@@ -257,7 +264,9 @@ The `examples/pretrain_{bert,gpt,t5}_distributed.sh` scripts use the PyTorch dis
We use two types of parallelism: data and model parallelism. We facilitate two distributed data parallel implementations: a simple one of our own that performs gradient all-reduce at the end of the back propagation step, and Torch's distributed data parallel wrapper that overlaps gradient reduction with back propagation computation. To switch between these two options use `--DDP-impl local` or `--DDP-impl torch`, respectively. As expected, Torch distributed data parallelism is more efficient at larger model sizes. For example, for the 8.3 billion parameter model running on 512 GPUs, the scaling increases from 60% to 76% when Torch's distributed data parallel is used. However, the overlapping method requires more memory and for some configurations (e.g., 2.5 billion parameters using 2-way model parallel and 1.2 billion parameters with no model parallel) can make the overall training slower as a result. We empirically found that using a smaller model in those cases improves the training time.
Second, we developed a simple and efficient two-dimensional model-parallel approach. To use tensor model parallelism (splitting execution of a single transformer module over multiple GPUs), add the `--tensor-model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. To use pipeline model parallelism (sharding the transformer modules into stages with an equal number of transformer modules on each stage, and then pipelining execution by breaking the batch into smaller microbatches), use the `--pipeline-model-parallel-size` flag to specify the number of stages to split the model into (e.g., splitting a model with 24 transformer layers across 4 stages would mean each stage gets 6 transformer layers each).
Second, we developed a simple and efficient two-dimensional model-parallel approach. To use tensor model parallelism (splitting execution of a single transformer module over multiple GPUs), add the `--tensor-model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. To use sequence parallelism, specify `--sequence-parallel`; it requires tensor model parallelism because the sequence is split across the same GPUs.
To use pipeline model parallelism (sharding the transformer modules into stages with an equal number of transformer modules on each stage, and then pipelining execution by breaking the batch into smaller microbatches), use the `--pipeline-model-parallel-size` flag to specify the number of stages to split the model into (e.g., splitting a model with 24 transformer layers across 4 stages would mean each stage gets 6 transformer layers each).
<!-- The number of microbatches in a per-pipeline minibatch is controlled by the `--num-microbatches-in-minibatch` argument. With `WORLD_SIZE` GPUs, `TENSOR_MP_SIZE` tensor-model-parallel size, `PIPELINE_MP_SIZE` pipeline-model-parallel-size, `WORLD_SIZE`/(`TENSOR_MP_SIZE` * `PIPELINE_MP_SIZE`) GPUs will be used for data parallelism. The default values for `--tensor-model-parallel-size` and `--pipeline-model-parallel-size` is 1, which will not implement either form of model parallelism. -->
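As a concrete illustration of the formula in the comment above, the following sketch (purely illustrative; the names mirror the comment rather than Megatron's code) shows how the world size is divided among the three parallelism dimensions:
<pre>
# How WORLD_SIZE GPUs split into tensor-, pipeline-, and data-parallel groups,
# following the formula in the comment above.
def data_parallel_size(world_size, tensor_mp_size, pipeline_mp_size):
    assert world_size % (tensor_mp_size * pipeline_mp_size) == 0
    return world_size // (tensor_mp_size * pipeline_mp_size)

# e.g. 512 GPUs with TP=8 and PP=8 leave 512 / (8 * 8) = 8-way data parallelism.
print(data_parallel_size(512, 8, 8))  # -> 8
</pre>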
......@@ -291,6 +300,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./pretrain_<model>.py \
--data-path $DATA_PATH \
--tensor-model-parallel-size $TENSOR_MP_SIZE \
--pipeline-model-parallel-size $PIPELINE_MP_SIZE \
--sequence-parallel \
--DDP-impl torch
</pre>
......@@ -298,11 +308,13 @@ The interleaved pipelining schedule (more details in Section 2.2.2 of [our paper
## Activation Checkpointing and Recomputation
To reduce GPU memory usage when deploying a large model to a training system, we support activation checkpointing and recomputation. We use a Transformer layer as the unit of checkpointing because the activation size bloats in the middle of a Transformer layer, so checkpointing the input of a Transformer layer is storage-efficient. We support two activation checkpointing methods: `uniform` and `block`.
To reduce GPU memory usage when deploying a large model to a training system, we support activation checkpointing and recomputation. We support two levels of recompute granularity: `selective` and `full`. Selective recomputation is the default and is recommended in almost all cases. It saves the activations that take less space and are expensive to recompute, and recomputes the activations that take a lot of space but are relatively cheap to recompute (see [our paper](https://arxiv.org/pdf/2205.05198) for details). To enable selective activation recompute, simply use `--recompute-activations`.
For cases where memory is very tight, `full` checkpointing saves just the inputs to a transformer layer, or a block of transformer layers, and recomputes everything else. To turn on full activation recompute, use `--recompute-granularity full`. When using full activation recomputation, there are two methods: `uniform` and `block`, chosen using the `--recompute-method` argument.
Uniform method uniformly divides the Transformer layers into groups of layers and stores the input activations of each group in memory. The baseline group size is 1 and, in this case, the input activation of each Transformer layer is checkpointed. When GPU memory is insufficient, increasing the number of layers per group reduces memory usage and thus enables running a bigger model. For example, with 4 layers per group, the input activation of each group of 4 Transformer layers is checkpointed.
* Uniform method uniformly divides the Transformer layers into groups of layers and stores the input activations of each group in memory. The baseline group size is 1 and, in this case, the input activation of each Transformer layer is checkpointed. When GPU memory is insufficient, increasing the number of layers per group reduces memory usage and thus enables running a bigger model. For example, with 4 layers per group, the input activation of each group of 4 Transformer layers is checkpointed.
Block method checkpoints the input activations of a set number of individual Transformer layers per pipeline stage and does the rest of the layers without any checkpointing. This method can be used to skip checkpointing some Transformer layers until the GPU memory is fully used, which is applicable only when there is unused GPU memory. Checkpointing fewer transformer layers avoids unnecessary activation recomputation in the backprop and thus improves training performance. For example, when we specify 5 layers to checkpoint out of 8 layers per pipeline stage, the input activations of only the first 5 Transformer layers are checkpointed and activation recomputation for the remaining 3 layers is not needed in the backprop.
* Block method checkpoints the input activations of a set number of individual Transformer layers per pipeline stage and does the rest of the layers without any checkpointing. This method can be used to skip checkpointing some Transformer layers until the GPU memory is fully used, which is applicable only when there is unused GPU memory. Checkpointing fewer transformer layers avoids unnecessary activation recomputation in the backprop and thus improves training performance. For example, when we specify 5 layers to checkpoint out of 8 layers per pipeline stage, the input activations of only the first 5 Transformer layers are checkpointed and activation recomputation for the remaining 3 layers is not needed in the backprop. A short sketch of how the two methods pick which layers to checkpoint is given below.
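The following is a small illustrative sketch, not Megatron's implementation, of which transformer-layer inputs end up checkpointed under the two methods, given the `--recompute-method` and `--recompute-num-layers` settings described above:
<pre>
# Illustrative only: which layer inputs are checkpointed under full recomputation.
def checkpointed_layers(num_layers, method, recompute_num_layers):
    if method == 'uniform':
        # Checkpoint the input of every group of `recompute_num_layers` layers.
        return list(range(0, num_layers, recompute_num_layers))
    if method == 'block':
        # Checkpoint only the first `recompute_num_layers` layers of the stage;
        # the remaining layers keep their activations and need no recomputation.
        return list(range(min(recompute_num_layers, num_layers)))
    raise ValueError(f'unknown recompute method: {method}')

print(checkpointed_layers(8, 'uniform', 4))  # [0, 4]: two groups of 4 layers
print(checkpointed_layers(8, 'block', 5))    # [0, 1, 2, 3, 4]: last 3 layers not recomputed
</pre>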
## GPT-3 Example
......
# Distributed Optimizer
The motivation for the distributed optimizer is to save memory by distributing the optimizer state evenly across data parallel ranks, versus the current method of replicating the optimizer state across data parallel ranks. As described in https://arxiv.org/abs/1910.02054, this branch specifically implements the following:
- [yes] distribute all 'non-overlapping' optimizer state (i.e., model params already in fp32 are NOT distributed)
- [no] distribute model gradients
- [no] distribute model parameters
Theoretical memory savings vary depending on the combination of the model's param dtype and grad dtype. In the current implementation, the theoretical number of bytes per parameter is (where 'd' is the data parallel size):
| | Non-distributed optim | Distributed optim |
| ------ | ------ | ------ |
| float16 param, float16 grads | 20 | 4 + 16/d |
| float16 param, fp32 grads | 18 | 6 + 12/d |
| fp32 param, fp32 grads | 16 | 8 + 8/d |
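The table can be written down directly as a small sketch (the split into replicated and sharded bytes is an interpretation of the table, assuming Adam-style optimizer state):
<pre>
# Theoretical bytes per parameter, reproducing the table above.
# 'replicated' = model param + grad kept on every rank; 'sharded' = optimizer state
# that the distributed optimizer splits across d data-parallel ranks.
BYTES = {
    # (param dtype, grad dtype): (replicated bytes, shardable optimizer bytes)
    ('fp16', 'fp16'): (2 + 2, 16),
    ('fp16', 'fp32'): (2 + 4, 12),
    ('fp32', 'fp32'): (4 + 4, 8),
}

def bytes_per_param(param_dtype, grad_dtype, d, use_distributed_optimizer):
    replicated, optim = BYTES[(param_dtype, grad_dtype)]
    return replicated + (optim / d if use_distributed_optimizer else optim)

print(bytes_per_param('fp16', 'fp16', d=8, use_distributed_optimizer=False))  # 20
print(bytes_per_param('fp16', 'fp16', d=8, use_distributed_optimizer=True))   # 6.0 (= 4 + 16/8)
</pre>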
The implementation of the distributed optimizer is centered on using the contiguous grad buffer for communicating grads & params between the model state and the optimizer state. At any given moment, the grad buffer holds one of the following:
1. all model grads
2. a 1/d size _copy_ of the main grads (before copying to the optimizer state)
3. a 1/d size _copy_ of the main params (after copying from the optimizer state)
4. all model params
5. zeros (or None), between iterations
The grad buffer is used for performing reduce-scatter and all-gather operations, for passing grads & params between the model state and optimizer state. With this implementation, no dynamic buffers are allocated.
The figures below illustrate the grad buffer's sharding scheme, and the key steps of the distributed optimizer's param update:
## Data flow
![Data flow](images/distrib_optimizer/data_flow.png)
## Sharding scheme
![Sharding scheme](images/distrib_optimizer/sharding_scheme.png)
## Key steps
_(note: using the illustrations above, and assuming fp16 grads; a minimal code sketch follows these steps)_
- Backward pass finishes (grad buffer holds 16 fp16 grad elements)
- Call reduce-scatter on each DP rank
- Each DP rank now has 4 elements within the grad buffer that are fully reduced (remaining 12 elements are garbage)
- Each DP rank copies its relevant 4 fp16 grad elements from the grad buffer into 4 fp32 main grad elements (separate buffer, owned by the optimizer); i.e.
- DP rank 0 copies elements [0:4]
- DP rank 1 copies elements [4:8]
- DP rank 2 copies elements [8:12]
- DP rank 3 copies elements [12:16]
- Optimizer.step()
- Each DP rank copies its 4 fp32 main (/optimizer) param elements into the corresponding 4 fp16 elements in the grad buffer
- Call all-gather on each DP rank
- Grad buffer now contains all 16, fully updated, fp16 model param elements
- Copy updated model params from grad buffer into their respective param tensors
- (At this point, the grad buffer is ready to be zeroed for the next iteration)
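The key steps above can be condensed into a small runnable sketch built on plain `torch.distributed` collectives. Everything here is an assumption made for illustration: the 16-element fp32 buffer, the plain SGD update, and all names. It is not Megatron's implementation, which keeps fp16 model state and fp32 main state as described above. Launch with e.g. `torchrun --nproc_per_node=4` on a node with 4 GPUs.
<pre>
# Minimal illustration of the grad-buffer data flow described above (not Megatron's code).
import torch
import torch.distributed as dist

def distributed_optimizer_step(grad_buffer, main_params, lr=0.1):
    d, rank = dist.get_world_size(), dist.get_rank()
    shard = grad_buffer.numel() // d
    lo, hi = rank * shard, (rank + 1) * shard

    # Reduce-scatter: afterwards only this rank's 1/d slice is fully reduced.
    reduced = torch.empty(shard, dtype=grad_buffer.dtype, device=grad_buffer.device)
    dist.reduce_scatter(reduced, list(grad_buffer.split(shard)))

    # Copy the reduced slice into the main params owned by the optimizer and step.
    main_params.add_(reduced, alpha=-lr)

    # Copy the updated main params back into this rank's slice of the buffer ...
    grad_buffer[lo:hi].copy_(main_params)
    # ... and all-gather so the whole buffer again holds all (updated) model params.
    dist.all_gather(list(grad_buffer.split(shard)), grad_buffer[lo:hi].clone())

if __name__ == '__main__':
    dist.init_process_group('nccl')            # reduce-scatter needs the NCCL backend (GPUs)
    torch.cuda.set_device(dist.get_rank())     # single-node assumption
    d, rank = dist.get_world_size(), dist.get_rank()
    buf = torch.arange(16.0, device='cuda')    # pretend these are the model grads
    shard = buf.numel() // d
    params = torch.zeros(shard, device='cuda') # this rank's main-param shard
    distributed_optimizer_step(buf, params)
    if rank == 0:
        print(buf)  # every rank's buffer now holds the updated params
</pre>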
......@@ -19,6 +19,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--sequence-parallel \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
......
......@@ -23,6 +23,7 @@ from .global_vars import get_tokenizer
from .global_vars import get_tensorboard_writer
from .global_vars import get_adlr_autoresume
from .global_vars import get_timers
from .global_vars import get_global_memory_buffer
from .initialize import initialize_megatron
def print_rank_0(message):
......
......@@ -39,7 +39,7 @@ def parse_args(extra_args_provider=None, defaults={},
parser = _add_data_args(parser)
parser = _add_autoresume_args(parser)
parser = _add_biencoder_args(parser)
parser = _add_vit_args(parser)
parser = _add_vision_args(parser)
parser = _add_logging_args(parser)
parser = _add_inference_args(parser)
......@@ -103,14 +103,20 @@ def parse_args(extra_args_provider=None, defaults={},
assert args.model_parallel_size is None, '--model-parallel-size is no ' \
'longer valid, use --tensor-model-parallel-size instead'
del args.model_parallel_size
if args.checkpoint_activations:
args.activations_checkpoint_method = 'uniform'
args.recompute_granularity = 'full'
args.recompute_method = 'uniform'
if args.rank == 0:
print('--checkpoint-activations is no longer valid, '
'use --activation-checkpoint-method instead. '
'Defaulting to activation-checkpoint-method=uniform.')
'use --recompute-granularity and --recompute-method instead. '
'Defaulting to recompute-granularity=full and recompute-method=uniform.')
del args.checkpoint_activations
if args.recompute_activations:
args.recompute_granularity = 'selective'
del args.recompute_activations
# Set input defaults.
for key in defaults:
# For default to be valid, it should not be provided in the
......@@ -172,6 +178,20 @@ def parse_args(extra_args_provider=None, defaults={},
if args.accumulate_allreduce_grads_in_fp32:
assert args.DDP_impl == 'local'
assert args.use_contiguous_buffers_in_local_ddp
else:
if args.gradient_accumulation_fusion:
args.gradient_accumulation_fusion = False
if args.rank == 0:
print('Gradient accumulation fusion to linear layer weight '
'gradient computation is supported only with fp32 '
'gradient accumulation. Setting gradient_accumulation_fusion '
'to False', flush=True)
# If we use the distributed optimizer, we need to have local DDP
# and we should make sure use-contiguous-buffers-in-local-ddp is on.
if args.use_distributed_optimizer:
assert args.DDP_impl == 'local'
assert args.use_contiguous_buffers_in_local_ddp
# For torch DDP, we do not use contiguous buffer
if args.DDP_impl == 'torch':
......@@ -270,19 +290,38 @@ def parse_args(extra_args_provider=None, defaults={},
'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
'Defaulting to no_persist_layer_norm=True')
# Activation checkpointing.
if args.distribute_checkpointed_activations:
# Activation recomputing.
if args.distribute_saved_activations:
assert args.tensor_model_parallel_size > 1, 'can distribute ' \
'checkpointed activations only across tensor model ' \
'recomputed activations only across tensor model ' \
'parallel groups'
assert args.activations_checkpoint_method is not None, \
'for distributed checkpoint activations to work you '\
'need to use a activation-checkpoint method '
assert args.recompute_granularity == 'full', \
'distributed recompute activations is only '\
'applicable to full recompute granularity'
assert args.recompute_method is not None, \
'for distributed recompute activations to work you '\
'need to use a recompute method '
assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 10, \
'distributed checkpoint activations are supported for pytorch ' \
'distributed recompute activations are supported for pytorch ' \
'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \
'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR)
if args.recompute_granularity == 'selective':
assert args.recompute_method is None, \
'recompute method is not yet supported for ' \
'selective recomputing granularity'
# disable sequence parallelism when tp=1
# to avoid change in numerics when
# sequence_parallelism is enabled.
if args.tensor_model_parallel_size == 1:
args.sequence_parallel = False
# disable async_tensor_model_parallel_allreduce when
# model parallel memory optimization is enabled
if args.sequence_parallel:
args.async_tensor_model_parallel_allreduce = False
_print_args(args)
return args
......@@ -357,7 +396,8 @@ def _add_network_size_args(parser):
group.add_argument('--bert-no-binary-head', action='store_false',
help='Disable BERT binary head.',
dest='bert_binary_head')
group.add_argument('--num-experts', type=int, default=None,
help='Number of Experts in Switch Transformer (None means no Switch)')
return parser
......@@ -462,27 +502,40 @@ def _add_training_args(parser):
' (1024 - 16) / 8 = 126 intervals will increase'
'the batch size linearly to 1024. In each interval'
'we will use approximately 300000 / 126 = 2380 samples.')
group.add_argument('--checkpoint-activations', action='store_true',
help='Checkpoint activation to allow for training '
group.add_argument('--recompute-activations', action='store_true',
help='Recompute activations to allow for training '
'with larger models, sequences, and batch sizes.')
group.add_argument('--distribute-checkpointed-activations',
group.add_argument('--recompute-granularity', type=str, default=None,
choices=['full', 'selective'],
help='Checkpoint activations to allow for training '
'with larger models, sequences, and batch sizes. '
'It is supported at two granularities 1) full: '
'whole transformer layer is recomputed, '
'2) selective: core attention part of the transformer '
'layer is recomputed.')
group.add_argument('--distribute-saved-activations',
action='store_true',
help='If set, distribute checkpointed activations '
help='If set, distribute recomputed activations '
'across model parallel group.')
group.add_argument('--activations-checkpoint-method', type=str, default=None,
group.add_argument('--recompute-method', type=str, default=None,
choices=['uniform', 'block'],
help='1) uniform: uniformly divide the total number of '
'Transformer layers and checkpoint the input activation of '
'each divided chunk, '
'2) checkpoint the input activations of only a set number of '
'Transformer layers and recompute the input activation of '
'each divided chunk at specified granularity, '
'2) recompute the input activations of only a set number of '
'individual Transformer layers per pipeline stage and do the '
'rest without any checkpointing, '
'default) do not apply activations checkpoint to any layers')
group.add_argument('--activations-checkpoint-num-layers', type=int, default=1,
'rest without any recomputing at specified granularity, '
'default) do not apply activations recompute to any layers')
group.add_argument('--recompute-num-layers', type=int, default=1,
help='1) uniform: the number of Transformer layers in each '
'uniformly divided checkpoint unit, '
'uniformly divided recompute unit, '
'2) block: the number of individual Transformer layers '
'to checkpoint within each pipeline stage.')
'to recompute within each pipeline stage.')
# deprecated
group.add_argument('--checkpoint-activations', action='store_true',
help='Checkpoint activation to allow for training '
'with larger models, sequences, and batch sizes.')
group.add_argument('--train-iters', type=int, default=None,
help='Total number of iterations to train over all '
'training runs. Note that either train-iters or '
......@@ -521,15 +574,23 @@ def _add_training_args(parser):
choices=['single', 'cyclic'],
help='Single pass vs multiple pass data loader')
group.add_argument('--no-async-tensor-model-parallel-allreduce',
action='store_true',
action='store_false',
help='Disable asynchronous execution of '
'tensor-model-parallel all-reduce with weight '
'gradient computation of a column-linear layer.')
'gradient computation of a column-linear layer.',
dest='async_tensor_model_parallel_allreduce')
group.add_argument('--no-persist-layer-norm', action='store_true',
help='Disable using persistent fused layer norm kernel. '
'This kernel supports only a set of hidden sizes. Please '
'check persist_ln_hidden_sizes if your hidden '
'size is supported.')
group.add_argument('--sequence-parallel', action='store_true',
help='Enable sequence parallel optimization.')
group.add_argument('--no-gradient-accumulation-fusion',
action='store_false',
help='Disable fusing gradient accumulation to weight '
'gradient computation of linear layers',
dest='gradient_accumulation_fusion')
return parser
......@@ -710,6 +771,9 @@ def _add_distributed_args(parser):
'is placed on its own pipeline stage, without any '
'transformer layers. (For T5, this flag currently only '
'affects the encoder embedding.)')
group.add_argument('--use-distributed-optimizer', action='store_true',
help='Use distributed optimizer.')
return parser
......@@ -856,9 +920,10 @@ def _add_biencoder_args(parser):
return parser
def _add_vit_args(parser):
group = parser.add_argument_group(title="vit")
def _add_vision_args(parser):
group = parser.add_argument_group(title="vision")
# general vision arguments
group.add_argument('--num-classes', type=int, default=1000,
help='num of classes in vision classification task')
group.add_argument('--img-h', type=int, default=224,
......@@ -868,7 +933,7 @@ def _add_vit_args(parser):
group.add_argument('--num-channels', type=int, default=3,
help='Number of channels in input image data')
group.add_argument('--patch-dim', type=int, default=16,
help='patch dimension used in vit')
help='patch dimension')
group.add_argument('--classes-fraction', type=float, default=1.0,
help='training with fraction of classes.')
group.add_argument('--data-per-class-fraction', type=float, default=1.0,
......@@ -876,5 +941,49 @@ def _add_vit_args(parser):
group.add_argument('--no-data-sharding', action='store_false',
help='Disable data sharding.',
dest='data_sharding')
group.add_argument('--head-lr-mult', type=float, default=1.0,
help='learning rate multiplier for head during finetuning')
# pretraining type and backbone selection
group.add_argument('--vision-pretraining', action='store_true',
help='flag to indicate vision pretraining')
group.add_argument('--vision-pretraining-type', type=str, default='classify',
choices=['classify', 'inpaint', 'dino'],
help='pretraining objectives')
group.add_argument('--vision-backbone-type', type=str, default='vit',
choices=['vit', 'mit', 'swin'],
help='backbone type')
group.add_argument('--swin-backbone-type', type=str, default='tiny',
choices=['tiny', 'base', 'h3'],
help='swin backbone type')
# inpainting arguments
group.add_argument('--mask-type', type=str, default='random',
choices=['random', 'row'],
help='mask types')
group.add_argument('--mask-factor', type=float, default=1.0,
help='mask size scaling parameter')
# dino arguments
group.add_argument('--iter-per-epoch', type=int, default=1250,
help='iterations per epoch')
group.add_argument('--dino-local-img-size', type=int, default=96,
help='Image size for vision classification task')
group.add_argument('--dino-local-crops-number', type=int, default=10,
help='Number of local crops')
group.add_argument('--dino-head-hidden-size', type=int, default=2048,
help='Hidden dimension size in dino head')
group.add_argument('--dino-bottleneck-size', type=int, default=256,
help='Bottleneck dimension in dino head')
group.add_argument('--dino-freeze-last-layer', type=float, default=1,
help='Freezing last layer weights')
group.add_argument('--dino-norm-last-layer', action='store_true',
help='Disable Norm in last layer.')
group.add_argument('--dino-warmup-teacher-temp', type=float, default=0.04,
help='warmup teacher temperature')
group.add_argument('--dino-teacher-temp', type=float, default=0.07,
help='teacher temperature')
group.add_argument('--dino-warmup-teacher-temp-epochs', type=int, default=30,
help='warmup teacher temperature epochs')
return parser
......@@ -81,24 +81,34 @@ def ensure_directory_exists(filename):
os.makedirs(dirname)
def get_checkpoint_name(checkpoints_path, iteration,
release=False):
def get_checkpoint_names(checkpoints_path, iteration, use_distributed_optimizer,
release=False):
"""A unified checkpoint name."""
if release:
directory = 'release'
else:
directory = 'iter_{:07d}'.format(iteration)
# Use both the tensor and pipeline MP rank.
# Use both the tensor and pipeline MP rank. If using the distributed
# optimizer, then the optimizer's path must additionally include the
# data parallel rank.
if mpu.get_pipeline_model_parallel_world_size() == 1:
return os.path.join(checkpoints_path, directory,
'mp_rank_{:02d}'.format(
mpu.get_tensor_model_parallel_rank()),
'model_optim_rng.pt')
return os.path.join(checkpoints_path, directory,
'mp_rank_{:02d}_{:03d}'.format(
mpu.get_tensor_model_parallel_rank(),
mpu.get_pipeline_model_parallel_rank()),
'model_optim_rng.pt')
common_path = os.path.join(checkpoints_path, directory,
'mp_rank_{:02d}'.format(
mpu.get_tensor_model_parallel_rank()))
else:
common_path = os.path.join(checkpoints_path, directory,
'mp_rank_{:02d}_{:03d}'.format(
mpu.get_tensor_model_parallel_rank(),
mpu.get_pipeline_model_parallel_rank()))
if use_distributed_optimizer:
model_name = os.path.join(common_path, "model_rng.pt")
optim_name = os.path.join(
common_path + "_%03d" % mpu.get_data_parallel_rank(),
"optim.pt")
else:
model_name = optim_name = os.path.join(common_path, "model_optim_rng.pt")
return model_name, optim_name
def get_checkpoint_tracker_filename(checkpoints_path):
......@@ -177,38 +187,64 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler):
print_rank_0('saving checkpoint at iteration {:7d} to {}'.format(
iteration, args.save))
# collect rng state across data parallel ranks
# Collect rng state across data parallel ranks.
rng_state = get_rng_state()
if not torch.distributed.is_initialized() or mpu.get_data_parallel_rank() == 0:
# Checkpoint file names.
model_checkpoint_name, optim_checkpoint_name = \
get_checkpoint_names(args.save, iteration, args.use_distributed_optimizer)
# Collect args, model, RNG.
model_state_dict = {}
if not torch.distributed.is_initialized() \
or mpu.get_data_parallel_rank() == 0:
# Arguments, iteration, and model.
state_dict = {}
state_dict['args'] = args
state_dict['checkpoint_version'] = 3.0
state_dict['iteration'] = iteration
model_state_dict['args'] = args
model_state_dict['checkpoint_version'] = 3.0
model_state_dict['iteration'] = iteration
if len(model) == 1:
state_dict['model'] = model[0].state_dict_for_save_checkpoint()
model_state_dict['model'] = model[0].state_dict_for_save_checkpoint()
else:
for i in range(len(model)):
mpu.set_virtual_pipeline_model_parallel_rank(i)
state_dict['model%d' % i] = model[i].state_dict_for_save_checkpoint()
# Optimizer stuff.
if not args.no_save_optim:
if optimizer is not None:
state_dict['optimizer'] = optimizer.state_dict()
if opt_param_scheduler is not None:
state_dict['opt_param_scheduler'] = opt_param_scheduler.state_dict()
model_state_dict['model%d' % i] = \
model[i].state_dict_for_save_checkpoint()
# RNG states.
if not args.no_save_rng:
state_dict["rng_state"] = rng_state
model_state_dict["rng_state"] = rng_state
# Save.
checkpoint_name = get_checkpoint_name(args.save, iteration)
ensure_directory_exists(checkpoint_name)
torch.save(state_dict, checkpoint_name)
# Collect optimizer state. (Optimizer is saved separately from the model, due
# to the conflicting data pattern when using the distributed optimizer.)
optim_state_dict = {}
if not args.no_save_optim \
and (not torch.distributed.is_initialized()
or mpu.get_data_parallel_rank() == 0
or args.use_distributed_optimizer):
# Optimizer stuff.
if optimizer is not None:
optim_state_dict['optimizer'] = optimizer.state_dict()
if opt_param_scheduler is not None:
optim_state_dict['opt_param_scheduler'] = \
opt_param_scheduler.state_dict()
# Save.
if args.use_distributed_optimizer:
# Save model separate from optimizer.
if model_state_dict:
ensure_directory_exists(model_checkpoint_name)
torch.save(model_state_dict, model_checkpoint_name)
if optim_state_dict:
ensure_directory_exists(optim_checkpoint_name)
torch.save(optim_state_dict, optim_checkpoint_name)
else:
# Save model and optimizer together.
state_dict = {**model_state_dict, **optim_state_dict}
if state_dict: # only saves if populated (i.e., inherits conditions above)
ensure_directory_exists(model_checkpoint_name)
torch.save(state_dict, model_checkpoint_name)
# Wait so everyone is done (necessary)
if torch.distributed.is_initialized():
......@@ -322,12 +358,19 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
iteration, release = read_metadata(tracker_filename)
# Checkpoint.
checkpoint_name = get_checkpoint_name(load_dir, iteration, release)
model_checkpoint_name, optim_checkpoint_name = \
get_checkpoint_names(load_dir, iteration,
args.use_distributed_optimizer,
release)
print_rank_0(f' loading checkpoint from {args.load} at iteration {iteration}')
# Load the checkpoint.
try:
state_dict = torch.load(checkpoint_name, map_location='cpu')
model_state_dict = torch.load(model_checkpoint_name, map_location='cpu')
if args.use_distributed_optimizer:
optim_state_dict = torch.load(optim_checkpoint_name, map_location='cpu')
else:
optim_state_dict = model_state_dict
except ModuleNotFoundError:
from megatron.fp16_deprecated import loss_scaler
# For backward compatibility.
......@@ -336,7 +379,8 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
'megatron.fp16_deprecated.loss_scaler']
sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
'megatron.fp16_deprecated.loss_scaler']
state_dict = torch.load(checkpoint_name, map_location='cpu')
model_state_dict = torch.load(model_checkpoint_name, map_location='cpu')
optim_state_dict = torch.load(optim_checkpoint_name, map_location='cpu')
sys.modules.pop('fp16.loss_scaler', None)
sys.modules.pop('megatron.fp16.loss_scaler', None)
except BaseException as e:
......@@ -344,18 +388,18 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
print_rank_0(e)
sys.exit()
# set checkpoint version
set_checkpoint_version(state_dict.get('checkpoint_version', 0))
# Set checkpoint version.
set_checkpoint_version(model_state_dict.get('checkpoint_version', 0))
# Set iteration.
if args.finetune or release:
iteration = 0
else:
try:
iteration = state_dict['iteration']
iteration = model_state_dict['iteration']
except KeyError:
try: # Backward compatible with older checkpoints
iteration = state_dict['total_iters']
iteration = model_state_dict['total_iters']
except KeyError:
print_rank_0('A metadata file exists but unable to load '
'iteration from checkpoint {}, exiting'.format(
......@@ -365,8 +409,8 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
# Check arguments.
assert args.consumed_train_samples == 0
assert args.consumed_valid_samples == 0
if 'args' in state_dict:
checkpoint_args = state_dict['args']
if 'args' in model_state_dict:
checkpoint_args = model_state_dict['args']
check_checkpoint_args(checkpoint_args)
args.consumed_train_samples = getattr(checkpoint_args,
'consumed_train_samples', 0)
......@@ -378,11 +422,11 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
# Model.
if len(model) == 1:
model[0].load_state_dict(state_dict['model'], strict=strict)
model[0].load_state_dict(model_state_dict['model'], strict=strict)
else:
for i in range(len(model)):
mpu.set_virtual_pipeline_model_parallel_rank(i)
model[i].load_state_dict(state_dict['model%d' % i], strict=strict)
model[i].load_state_dict(model_state_dict['model%d' % i], strict=strict)
# Fix up query/key/value matrix ordering if needed
checkpoint_version = get_checkpoint_version()
......@@ -393,12 +437,12 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
if not release and not args.finetune and not args.no_load_optim:
try:
if optimizer is not None:
optimizer.load_state_dict(state_dict['optimizer'])
optimizer.load_state_dict(optim_state_dict['optimizer'])
if opt_param_scheduler is not None:
if 'lr_scheduler' in state_dict: # backward compatbility
opt_param_scheduler.load_state_dict(state_dict['lr_scheduler'])
if 'lr_scheduler' in optim_state_dict: # backward compatbility
opt_param_scheduler.load_state_dict(optim_state_dict['lr_scheduler'])
else:
opt_param_scheduler.load_state_dict(state_dict['opt_param_scheduler'])
opt_param_scheduler.load_state_dict(optim_state_dict['opt_param_scheduler'])
except KeyError:
print_rank_0('Unable to load optimizer from checkpoint {}. '
'Specify --no-load-optim or --finetune to prevent '
......@@ -409,13 +453,13 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
# rng states.
if not release and not args.finetune and not args.no_load_rng:
try:
if 'rng_state' in state_dict:
if 'rng_state' in model_state_dict:
# access rng_state for data parallel rank
if args.data_parallel_random_init:
rng_state = state_dict['rng_state'][mpu.get_data_parallel_rank()]
rng_state = model_state_dict['rng_state'][mpu.get_data_parallel_rank()]
else:
rng_state = state_dict['rng_state'][0]
rng_state = model_state_dict['rng_state'][0]
random.setstate(rng_state['random_rng_state'])
np.random.set_state(rng_state['np_rng_state'])
torch.set_rng_state(rng_state['torch_rng_state'])
......@@ -426,15 +470,15 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri
mpu.get_cuda_rng_tracker().set_states(
rng_state['rng_tracker_states'])
else: # backward compatability
random.setstate(state_dict['random_rng_state'])
np.random.set_state(state_dict['np_rng_state'])
torch.set_rng_state(state_dict['torch_rng_state'])
torch.cuda.set_rng_state(state_dict['cuda_rng_state'])
random.setstate(model_state_dict['random_rng_state'])
np.random.set_state(model_state_dict['np_rng_state'])
torch.set_rng_state(model_state_dict['torch_rng_state'])
torch.cuda.set_rng_state(model_state_dict['cuda_rng_state'])
# Check for empty states array
if not state_dict['rng_tracker_states']:
if not model_state_dict['rng_tracker_states']:
raise KeyError
mpu.get_cuda_rng_tracker().set_states(
state_dict['rng_tracker_states'])
model_state_dict['rng_tracker_states'])
except KeyError:
print_rank_0('Unable to load rng state from checkpoint {}. '
'Specify --no-load-rng or --finetune to prevent '
......@@ -469,12 +513,14 @@ def load_biencoder_checkpoint(model, only_query_model=False,
with open(tracker_filename, 'r') as f:
iteration = int(f.read().strip())
checkpoint_name = get_checkpoint_name(load_path, iteration, False)
checkpoint_name, _ = get_checkpoint_names(load_path, iteration,
args.use_distributed_optimizer,
False)
if mpu.get_data_parallel_rank() == 0:
print('global rank {} is loading checkpoint {}'.format(
torch.distributed.get_rank(), checkpoint_name))
state_dict = torch.load(checkpoint_name, map_location='cpu')
state_dict = torch.load(checkpoint_name, map_location='cpu')
ret_state_dict = state_dict['model']
if only_query_model:
......
......@@ -295,14 +295,19 @@ class IndexedDatasetBuilder(object):
index = IndexedDataset(another_file)
assert index.dtype == self.dtype
doc_offset = len(self.sizes)
begin = self.data_offsets[-1]
for offset in index.data_offsets[1:]:
self.data_offsets.append(begin + offset)
for data_offset in index.data_offsets[1:]:
self.data_offsets.append(begin + data_offset)
self.sizes.extend(index.sizes)
begin = self.dim_offsets[-1]
for dim_offset in index.dim_offsets[1:]:
self.dim_offsets.append(begin + dim_offset)
self.doc_idx.extend((doc_offset + index.doc_idx)[1:])
with open(data_file_path(another_file), 'rb') as f:
while True:
data = f.read(1024)
......@@ -556,8 +561,9 @@ class MMapIndexedDatasetBuilder(object):
index = MMapIndexedDataset.Index(index_file_path(another_file))
assert index.dtype == self._dtype
for size in index.sizes:
self._sizes.append(size)
offset = len(self._sizes)
self._sizes.extend(index.sizes)
self._doc_idx.extend((offset + index.doc_idx)[1:])
# Concatenate data
with open(data_file_path(another_file), 'rb') as f:
......
......@@ -22,6 +22,43 @@ from megatron import get_args
from megatron.data.image_folder import ImageFolder
from megatron.data.autoaugment import ImageNetPolicy
from megatron.data.data_samplers import RandomSeedDataset
from PIL import Image, ImageFilter, ImageOps
class GaussianBlur(object):
"""
Apply Gaussian Blur to the PIL image.
"""
def __init__(self, p=0.5, radius_min=0.1, radius_max=2.):
self.prob = p
self.radius_min = radius_min
self.radius_max = radius_max
def __call__(self, img):
do_it = random.random() <= self.prob
if not do_it:
return img
return img.filter(
ImageFilter.GaussianBlur(
radius=random.uniform(self.radius_min, self.radius_max)
)
)
class Solarization(object):
"""
Apply Solarization to the PIL image.
"""
def __init__(self, p):
self.p = p
def __call__(self, img):
if random.random() < self.p:
return ImageOps.solarize(img)
else:
return img
class ClassificationTransform():
def __init__(self, image_size, train=True):
......@@ -52,14 +89,160 @@ class ClassificationTransform():
return output
class InpaintingTransform():
def __init__(self, image_size, train=True):
args = get_args()
self.mask_factor = args.mask_factor
self.mask_type = args.mask_type
self.image_size = image_size
self.patch_size = args.patch_dim
self.mask_size = int(self.mask_factor*(image_size[0]/self.patch_size)*(image_size[1]/self.patch_size))
self.train = train
assert args.fp16 or args.bf16
self.data_type = torch.half if args.fp16 else torch.bfloat16
if self.train:
self.transform = T.Compose([
T.RandomResizedCrop(self.image_size),
T.RandomHorizontalFlip(),
T.ColorJitter(0.4, 0.4, 0.4, 0.1),
ImageNetPolicy(),
T.ToTensor(),
T.ConvertImageDtype(self.data_type)
])
else:
self.transform = T.Compose([
T.Resize(self.image_size, interpolation=2),
T.CenterCrop(self.image_size),
T.ToTensor(),
T.ConvertImageDtype(self.data_type)
])
def gen_mask(self, image_size, mask_size, mask_type, patch_size):
# output: mask as a list with indices for missing patches
action_list = [[0, 1], [0, -1], [1, 0], [-1, 0]]
assert image_size[0] == image_size[1]
img_size_patch = image_size[0] // patch_size
# drop masked patches
mask = torch.zeros((image_size[0], image_size[1]), dtype=torch.float)
if mask_type == 'random':
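# Random walk over the patch grid: start at a random patch and take mask_size random
# steps, masking the patch reached after each step (patches may repeat, so fewer than
# mask_size distinct patches can end up masked).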
x = torch.randint(0, img_size_patch, ())
y = torch.randint(0, img_size_patch, ())
for i in range(mask_size):
r = torch.randint(0, len(action_list), ())
x = torch.clamp(x + action_list[r][0], min=0, max=img_size_patch - 1)
y = torch.clamp(y + action_list[r][1], min=0, max=img_size_patch - 1)
x_offset = x * patch_size
y_offset = y * patch_size
mask[x_offset:x_offset+patch_size, y_offset:y_offset+patch_size] = 1
else:
assert mask_type == 'row'
count = 0
for x in reversed(range(img_size_patch)):
for y in reversed(range(img_size_patch)):
if (count < mask_size):
count += 1
x_offset = x * patch_size
y_offset = y * patch_size
mask[x_offset:x_offset+patch_size, y_offset:y_offset+patch_size] = 1
return mask
def __call__(self, input):
trans_input = self.transform(input)
mask = self.gen_mask(self.image_size, self.mask_size,
self.mask_type, self.patch_size)
mask = mask.unsqueeze(dim=0)
return trans_input, mask
class DinoTransform(object):
def __init__(self, image_size, train=True):
args = get_args()
self.data_type = torch.half if args.fp16 else torch.bfloat16
flip_and_color_jitter = T.Compose([
T.RandomHorizontalFlip(p=0.5),
T.RandomApply(
[T.ColorJitter(brightness=0.4, contrast=0.4,
saturation=0.2, hue=0.1)],
p=0.8
),
T.RandomGrayscale(p=0.2),
])
if args.fp16 or args.bf16:
normalize = T.Compose([
T.ToTensor(),
T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
T.ConvertImageDtype(self.data_type)
])
else:
normalize = T.Compose([
T.ToTensor(),
T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])
# first global crop
scale_const = 0.4
self.global_transform1 = T.Compose([
T.RandomResizedCrop(image_size,
scale=(scale_const, 1),
interpolation=Image.BICUBIC),
flip_and_color_jitter,
GaussianBlur(1.0),
normalize
])
# second global crop
self.global_transform2 = T.Compose([
T.RandomResizedCrop(image_size,
scale=(scale_const, 1),
interpolation=Image.BICUBIC),
flip_and_color_jitter,
GaussianBlur(0.1),
Solarization(0.2),
normalize
])
# transformation for the local small crops
self.local_crops_number = args.dino_local_crops_number
self.local_transform = T.Compose([
T.RandomResizedCrop(args.dino_local_img_size,
scale=(0.05, scale_const),
interpolation=Image.BICUBIC),
flip_and_color_jitter,
GaussianBlur(p=0.5),
normalize
])
def __call__(self, image):
crops = []
crops.append(self.global_transform1(image))
crops.append(self.global_transform2(image))
for _ in range(self.local_crops_number):
crops.append(self.local_transform(image))
return crops
def build_train_valid_datasets(data_path, image_size=224):
args = get_args()
train_transform = ClassificationTransform(image_size)
val_transform = ClassificationTransform(image_size, train=False)
if args.vision_pretraining_type == 'classify':
train_transform = ClassificationTransform(image_size)
val_transform = ClassificationTransform(image_size, train=False)
elif args.vision_pretraining_type == 'inpaint':
train_transform = InpaintingTransform(image_size, train=False)
val_transform = InpaintingTransform(image_size, train=False)
elif args.vision_pretraining_type == 'dino':
train_transform = DinoTransform(image_size, train=True)
val_transform = ClassificationTransform(image_size, train=False)
else:
raise Exception('{} vision pretraining type is not supported.'.format(
args.vision_pretraining_type))
# training dataset
train_data_path = data_path[0]
train_data_path = data_path[0] if len(data_path) <= 2 else data_path[2]
train_data = ImageFolder(
root=train_data_path,
transform=train_transform,
......
......@@ -94,6 +94,16 @@ def load(args):
fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
"fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags)
# =================================
# Fused gradient accumulation to weight gradient computation of linear layer
# =================================
if args.gradient_accumulation_fusion:
sources=[srcpath / 'fused_weight_gradient_dense.cpp',
srcpath / 'fused_weight_gradient_dense.cu']
fused_dense_cuda = _cpp_extention_load_helper(
"fused_dense_cuda", sources, [])
def _get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
......
#include <torch/torch.h>
#include <torch/extension.h>
#include <vector>
#include <stdio.h>
#include "type_shim.h"
template <typename T>
int wgrad_gemm_accum_fp32_cuda(T *input, T *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim);
void wgrad_gemm_accum_fp32(const at::Tensor input, const at::Tensor d_output, at::Tensor d_weight) {
at::Tensor input_2d, d_output_2d;
// input tensor: collapse to the first dim
auto in_sizes = input.sizes();
if (input.dim() > 2) {
input_2d = input.view({-1, in_sizes[in_sizes.size() - 1]});
} else {
input_2d = input;
}
// d_output tensor: collapse to the first dim
auto d_out_sizes = d_output.sizes();
if (d_output.dim() > 2) {
d_output_2d = d_output.view({-1, d_out_sizes[d_out_sizes.size() - 1]});
} else {
d_output_2d = d_output;
}
int hidden_dim = input_2d.size(0);
int in_dim = input_2d.size(1);
int out_dim = d_weight.size(0);
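// Note: "hidden_dim" here is the collapsed leading (token) dimension of the inputs,
// i.e. the reduction dimension of the weight-gradient GEMM below.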
DISPATCH_HALF_BFLOAT_AND_FLOAT(input_2d.scalar_type(), "wgrad_gemm_accum_fp32",
int result = wgrad_gemm_accum_fp32_cuda<scalar_t>(
input_2d.data_ptr<scalar_t>(),
d_output_2d.data_ptr<scalar_t>(),
d_weight.data_ptr<float>(),
in_dim,
hidden_dim,
out_dim);
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("wgrad_gemm_accum_fp32", &wgrad_gemm_accum_fp32, "wgrad gemm accum in fp32");
}
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <torch/torch.h>
/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
// BF16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemmex_wrapper(
cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
at::BFloat16* A,
int lda,
at::BFloat16* B,
int ldb,
const float* beta,
float* C,
int ldc) {
return cublasGemmEx(
handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_16BF,
lda,
B,
CUDA_R_16BF,
ldb,
beta,
C,
CUDA_R_32F,
ldc,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemmex_wrapper(
cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
at::Half* A,
int lda,
at::Half* B,
int ldb,
const float* beta,
float* C,
int ldc) {
return cublasGemmEx(
handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_16F,
lda,
B,
CUDA_R_16F,
ldb,
beta,
C,
CUDA_R_32F,
ldc,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
// FP32 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemmex_wrapper(
cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
float* A,
int lda,
float* B,
int ldb,
const float* beta,
float* C,
int ldc) {
return cublasGemmEx(
handle,
transa,
transb,
m,
n,
k,
alpha,
A,
CUDA_R_32F,
lda,
B,
CUDA_R_32F,
ldb,
beta,
C,
CUDA_R_32F,
ldc,
CUDA_R_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
// Accumulate the weight gradient of a linear layer in fp32:
//   d_weight[out_dim, in_dim] += d_output^T[out_dim, hidden_dim] * input[hidden_dim, in_dim]
// alpha = beta = 1.0, so the product is added to the existing fp32 buffer.
template <typename T>
int wgrad_gemm_accum_fp32_cuda(T *input, T *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim) {
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cudaStream_t stream;
cublasGetStream(handle, &stream);
const float alpha = 1.0;
const float beta = 1.0;
int status = 1;
status = gemmex_wrapper(
handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
in_dim,
out_dim,
hidden_dim,
&alpha,
input,
in_dim,
d_output,
out_dim,
&beta,
d_weight,
in_dim);
return status;
}
template int wgrad_gemm_accum_fp32_cuda<at::Half>(at::Half *input, at::Half *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim);
template int wgrad_gemm_accum_fp32_cuda<at::BFloat16>(at::BFloat16 *input, at::BFloat16 *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim);
template int wgrad_gemm_accum_fp32_cuda<float>(float *input, float *d_output, float *d_weight, int in_dim, int hidden_dim, int out_dim);
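For reference, a PyTorch sketch of the math the accumulation above performs (illustrative sizes; this reproduces the result, not the kernel):
import torch

tokens, in_dim, out_dim = 2048, 1024, 4096            # flattened s*b, input size, output size
inp = torch.randn(tokens, in_dim, dtype=torch.half)
d_output = torch.randn(tokens, out_dim, dtype=torch.half)
d_weight = torch.zeros(out_dim, in_dim, dtype=torch.float32)

# Equivalent of the cuBLAS call with alpha = beta = 1: fp32 accumulation of grad_output^T @ input.
d_weight += d_output.float().t() @ inp.float()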
......@@ -39,6 +39,32 @@
}
#define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...) \
switch(TYPE) \
{ \
case at::ScalarType::Half: \
{ \
using scalar_t = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: \
{ \
using scalar_t = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Float: \
{ \
using scalar_t = float; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
switch(TYPEIN) \
......
......@@ -18,7 +18,8 @@
import os
import sys
import time
from functools import reduce
import operator
import torch
from megatron import dist_signal_handler
......@@ -33,7 +34,7 @@ _GLOBAL_TENSORBOARD_WRITER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
_GLOBAL_SIGNAL_HANDLER = None
_GLOBAL_MEMORY_BUFFER = None
def get_args():
"""Return arguments."""
......@@ -77,15 +78,23 @@ def get_timers():
_ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
return _GLOBAL_TIMERS
def get_signal_handler():
_ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
return _GLOBAL_SIGNAL_HANDLER
def get_global_memory_buffer():
_ensure_var_is_initialized(_GLOBAL_MEMORY_BUFFER, 'global memory buffer')
return _GLOBAL_MEMORY_BUFFER
def _set_signal_handler():
global _GLOBAL_SIGNAL_HANDLER
_ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
_GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__()
def set_global_variables(extra_args_provider=None, args_defaults={},
ignore_unknown_args=False):
"""Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
......@@ -98,6 +107,7 @@ def set_global_variables(extra_args_provider=None, args_defaults={},
_set_tensorboard_writer(args)
_set_adlr_autoresume(args)
_set_timers()
_set_global_memory_buffer()
if args.exit_signal_handler:
_set_signal_handler()
......@@ -182,6 +192,12 @@ def _set_timers():
_ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
_GLOBAL_TIMERS = Timers()
def _set_global_memory_buffer():
"""Initialize global buffer"""
global _GLOBAL_MEMORY_BUFFER
_ensure_var_is_not_initialized(_GLOBAL_MEMORY_BUFFER, 'global memory buffer')
_GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()
def _ensure_var_is_initialized(var, name):
"""Make sure the input variable is not None."""
......@@ -273,3 +289,24 @@ class Timers:
print(string, flush=True)
else:
print(string, flush=True)
class GlobalMemoryBuffer:
"""Global buffer to avoid dynamic memory allocations.
Caller should ensure that buffers of the same name
are not used concurrently."""
def __init__(self):
self.buffer = {}
def get_tensor(self, tensor_shape, dtype, name):
required_len = reduce(operator.mul, tensor_shape, 1)
if self.buffer.get((name, dtype), None) is None or \
self.buffer[(name, dtype)].numel() < required_len:
self.buffer[(name, dtype)] = \
torch.empty(required_len,
dtype=dtype,
device=torch.cuda.current_device(),
requires_grad=False)
return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape)
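A minimal usage sketch for the buffer above, assuming a CUDA device is available; the shapes and the "mpu" tag are illustrative, and real callers reach the instance through get_global_memory_buffer():
import torch

buffer = GlobalMemoryBuffer()
a = buffer.get_tensor((1024, 512), torch.half, "mpu")   # first request allocates the storage
b = buffer.get_tensor((1024, 512), torch.half, "mpu")   # same (name, dtype): storage is reused
assert a.data_ptr() == b.data_ptr()
c = buffer.get_tensor((2048, 512), torch.half, "mpu")   # larger request grows the buffer once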
......@@ -31,6 +31,8 @@ from megatron import mpu
from megatron.global_vars import set_global_variables
from megatron.mpu import (set_tensor_model_parallel_rank,
set_tensor_model_parallel_world_size)
from megatron.model.transformer import bias_dropout_add_fused_train
from megatron.model.fused_bias_gelu import bias_gelu
def initialize_megatron(extra_args_provider=None, args_defaults={},
......@@ -64,9 +66,6 @@ def initialize_megatron(extra_args_provider=None, args_defaults={},
print('> setting random seeds to {} ...'.format(args.seed))
_set_random_seed(args.seed, args.data_parallel_random_init)
# Set pytorch JIT layer fusion options.
_set_jit_fusion_options()
args = get_args()
if args.lazy_mpu_init:
args.use_cpu_initialization=True
......@@ -230,7 +229,7 @@ def write_args_to_tensorboard():
global_step=args.iteration)
def _set_jit_fusion_options():
def set_jit_fusion_options():
"""Set PyTorch JIT layer fusion options."""
# flags required to enable jit fusion kernels
TORCH_MAJOR = int(torch.__version__.split('.')[0])
......@@ -251,3 +250,51 @@ def _set_jit_fusion_options():
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
_warmup_jit_function()
def _warmup_jit_function():
""" Compilie JIT functions before the main training steps """
args = get_args()
if args.bf16:
dtype = torch.bfloat16
elif args.fp16:
dtype = torch.float16
else:
dtype = torch.float32
# Warmup fused bias+gelu
bias = torch.rand(args.ffn_hidden_size // args.tensor_model_parallel_size,
dtype=dtype, device='cuda')
input = torch.rand((args.seq_length, args.micro_batch_size,
args.ffn_hidden_size // args.tensor_model_parallel_size),
dtype=dtype, device='cuda')
    # Warm up the JIT fusion with the requires_grad settings the inputs have in
    # both the forward pass and activation recomputation.
for bias_grad, input_grad in zip([True, True], [False, True]):
bias.requires_grad, input.requires_grad = bias_grad, input_grad
for _ in range(5):
output = bias_gelu(bias, input)
del bias, input, output
# Warmup fused bias+dropout+add
if args.sequence_parallel:
seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
else:
seq_length = args.seq_length
input = torch.rand((seq_length, args.micro_batch_size, args.hidden_size),
dtype=dtype, device='cuda')
residual = torch.rand((seq_length, args.micro_batch_size, args.hidden_size),
dtype=dtype, device='cuda')
bias = torch.rand((args.hidden_size), dtype=dtype, device='cuda').expand_as(residual)
dropout_rate = 0.1
    # Warm up the JIT fusion with the requires_grad settings the inputs have in
    # both the forward pass and activation recomputation.
for input_grad, bias_grad, residual_grad in zip([False, True], [True, True], [True, True]):
input.requires_grad = input_grad
bias.requires_grad = bias_grad
residual.requires_grad = residual_grad
for _ in range(5):
output = bias_dropout_add_fused_train(input, bias, residual, dropout_rate)
del bias, input, residual, output
torch.cuda.empty_cache()
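Since the fusion setup is renamed public here and no longer called inside initialize_megatron, the training entry point is presumably expected to invoke it itself; a hedged sketch of that call order (the caller is not shown in this diff):
from megatron.initialize import initialize_megatron, set_jit_fusion_options

initialize_megatron()        # parse args, set global state, initialize distributed
set_jit_fusion_options()     # configure the JIT fusers and run _warmup_jit_function()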
......@@ -78,7 +78,12 @@ class BertLMHead(MegatronModule):
self.parallel_output = parallel_output
self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
setattr(self.dense.weight, 'sequence_parallel', args.sequence_parallel)
setattr(self.dense.bias, 'sequence_parallel', args.sequence_parallel)
self.layernorm = LayerNorm(hidden_size,
eps=layernorm_epsilon,
sequence_parallel=args.sequence_parallel)
self.gelu = torch.nn.functional.gelu
if args.openai_gelu:
self.gelu = openai_gelu
......@@ -110,14 +115,20 @@ def post_language_model_processing(lm_output, pooled_output,
binary_logits = binary_head(pooled_output)
if lm_labels is None:
return lm_logits, binary_logits
# [s b h] => [b s h]
return lm_logits.transpose(0,1).contiguous(), binary_logits
else:
# [b s] => [s b]
lm_labels = lm_labels.transpose(0,1).contiguous()
# lm_logits : [s, b, h] and lm_labels: [s, b]
if fp16_lm_cross_entropy:
assert lm_logits.dtype == torch.half
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
else:
lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
lm_labels)
        # [s, b] => [b, s]
lm_loss = lm_loss.transpose(0,1).contiguous()
return lm_loss, binary_logits
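A shape-only sketch of the transposes above, with torch.nn.functional.cross_entropy standing in for mpu.vocab_parallel_cross_entropy and illustrative sizes:
import torch
import torch.nn.functional as F

s, b, v = 512, 4, 50304                       # sequence, micro-batch, padded vocab
lm_logits = torch.randn(s, b, v)              # LM head output: [s, b, v]
lm_labels = torch.randint(0, v, (b, s))       # labels from the batch: [b, s]

labels_sb = lm_labels.transpose(0, 1).contiguous()            # [b, s] => [s, b]
loss_sb = F.cross_entropy(lm_logits.view(-1, v).float(),
                          labels_sb.view(-1),
                          reduction='none').view(s, b)         # per-token loss: [s, b]
lm_loss = loss_sb.transpose(0, 1).contiguous()                 # [s, b] => [b, s]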
......